<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook V4.1//EN" 
[
<!ENTITY index SYSTEM      "rtindex.xml">
<!ENTITY ada      "<acronym>Ada</acronym>">
<!ENTITY adeos    "<acronym>Adeos</acronym>">
<!ENTITY api      "<acronym>API</acronym>">
<!ENTITY asr      "<acronym>ASR</acronym>">
<!ENTITY beos     "<acronym>BeOS</acronym>">
<!ENTITY ccc      "<acronym>C</acronym>">
<!ENTITY comedi   "<acronym>Comedi</acronym>">
<!ENTITY corba    "<acronym>CORBA</acronym>">
<!ENTITY cpp      "<acronym>C++</acronym>">
<!ENTITY cpu      "<acronym>CPU</acronym>">
<!ENTITY cubeos   "<acronym>CubeOS</acronym>">
<!ENTITY dcom     "<acronym>DCOM</acronym>">
<!ENTITY drops    "<acronym>DROPS</acronym>">
<!ENTITY dsr      "<acronym>DSR</acronym>">
<!ENTITY ecos     "<acronym>eCos</acronym>">
<!ENTITY eos      "<acronym>EOS</acronym>">
<!ENTITY elix     "<acronym>EL/IX</acronym>">
<!ENTITY etlinux  "<acronym>Etlinux</acronym>">
<!ENTITY fiasco   "<acronym>FIASCO</acronym>">
<!ENTITY fifo     "<acronym>FIFO</acronym>">
<!ENTITY fsmlabs  "<acronym>FSMLabs</acronym>">
<!ENTITY gnu      "<acronym>GNU</acronym>">
<!ENTITY gnuhurd  "<acronym>GNU/Hurd</acronym>">
<!ENTITY gnulinux "<acronym>GNU/Linux</acronym>">
<!ENTITY gpl      "<acronym>GPL</acronym>">
<!ENTITY ipc      "<acronym>IPC</acronym>">
<!ENTITY irq      "<acronym>IRQ</acronym>">
<!ENTITY isr      "<acronym>ISR</acronym>">
<!ENTITY java     "<acronym>Java</acronym>">
<!ENTITY kurt     "<acronym>KURT</acronym>">
<!ENTITY ldd      "<application>Linux Device Drivers</application>">
<!ENTITY lgpl     "<acronym>LGPL</acronym>">
<!ENTITY linux    "<acronym>Linux</acronym>">
<!ENTITY lilo     "<acronym>LILO</acronym>">
<!ENTITY ltt      "<acronym>Linux Trace Toolkit</acronym>">
<!ENTITY lxrt     "<acronym>LX/RT</acronym>">
<!ENTITY mmu      "<acronym>MMU</acronym>">
<!ENTITY nt       "<acronym>Microsoft NT</acronym>">
<!ENTITY orb      "<acronym>ORB</acronym>">
<!ENTITY osek     "<acronym>OSEK</acronym>">
<!ENTITY pci      "<acronym>PCI</acronym>">
<!ENTITY posix    "<acronym>POSIX</acronym>">
<!ENTITY qos      "<acronym>QoS</acronym>">
<!ENTITY qnx      "<acronym>QNX</acronym>">
<!ENTITY rpc      "<acronym>RPC</acronym>">
<!ENTITY redhat   "<acronym>RedHat</acronym>">
<!ENTITY rtai     "<acronym>RTAI</acronym>">
<!ENTITY rtcorba  "<acronym>RT-CORBA</acronym>">
<!ENTITY rtems    "<acronym>RTEMS</acronym>">
<!ENTITY rtlinux  "<acronym>RTLinux</acronym>">
<!ENTITY rtos     "<acronym>RTOS</acronym>">
<!ENTITY schedf   "<function>schedule()</function>">
<!ENTITY smp      "<acronym>SMP</acronym>">
<!ENTITY simpl    "<acronym>SIMPL</acronym>">
<!ENTITY uclinux  "<acronym>uClinux</acronym>">
<!ENTITY unix     "<acronym>UNIX</acronym>">
<!ENTITY uitron   "<acronym>&mu;ITRON</acronym>">
<!ENTITY unix98   "<acronym>UNIX98</acronym>">
<!ENTITY up       "<acronym>UP</acronym>">
<!ENTITY vxworks  "<acronym>VxWorks</acronym>">
<!ENTITY win95    "<acronym>Microsoft Windows 95</acronym>">
]>

<book id="index">

<?dbhtml filename="rtGuide.html"?>


 <bookinfo>
 <title> Real-Time and Embedded Guide </title>
  <authorgroup>
   <author>
    <firstname>Herman</firstname> <surname>Bruyninckx</surname>
    <affiliation>
      <orgname>K.U.Leuven, Mechanical Engineering</orgname>
<!--      <orgdiv>Department of Mechanical Engineering</orgdiv>
-->
      <address>
       <city>Leuven</city>
       <country>Belgium</country>
       <email>Herman.Bruyninckx@mech.kuleuven.ac.be</email>
      </address>
    </affiliation>
   </author>
 </authorgroup>
 <copyright>
  <year>2000, 2001, 2002</year>
  <holder>Herman.Bruyninckx@mech.kuleuven.ac.be</holder>
 </copyright>

 <abstract>
 <para>
This Guide covers the fundamentals of (i) real-time and embedded operating
systems (focusing mostly on the differences with general purpose
operating systems such as &linux;), and (ii) real-time programming.
The emphasis is on Free Software and Open Source Software examples:
&rtai;, &rtlinux;, &ecos;, &rtems;, &uclinux;, &hellip;, with a more
than proportional focus on &rtai;.
</para>
<para>
This text also talks about design issues, software patterns and
frameworks for real-time applications. That is, the
&ldquo;high-level&rdquo; aspects of these software projects. These
higher levels are often poorly dealt with in publications on real-time
programming, which leads to the unfortunate situation that still too
many real-time programmers use <emphasis>only</emphasis> the
powerful but dangerously unstructured &api; of their &rtos;, missing
the chance to develop more structured, and, hence, more deterministic
and more portable software systems.
</para>
<para>
Both the low-level &rtos; primitives, and the high-level design
issues, are illustrated by the real-world example of a hard real-time
core for feedback control and signal processing.
</para>
 </abstract>

 <revhistory>
  <revision>
    <revnumber>0.01</revnumber>
    <date>Aug 31, 2000</date>
    <authorinitials>hb</authorinitials>
    <revremark>Initial draft</revremark>
  </revision>
  <revision>
    <revnumber>0.02</revnumber>
    <date>Sep 30, 2000</date>
    <authorinitials>hb</authorinitials>
    <revremark>Added: more info about signals</revremark>
  </revision>
  <revision>
    <revnumber>0.03</revnumber>
    <date>Sep 20, 2001</date>
    <authorinitials>hb</authorinitials>
    <revremark>
Removed: empty hardware, user space GUI and FAQ sections.
Added: Software Patterns
    </revremark>
  </revision>
  <revision>
    <revnumber>0.04-build-20021211-1936</revnumber>
    <date>Dec 11, 2002</date>
    <authorinitials>hb</authorinitials>
    <revremark>
Extended and heavily reworked version. Preparing for pre-release.
    </revremark>
  </revision>
 </revhistory>

 <legalnotice>
  <para>
Permission is granted to copy, distribute and/or modify this document
under the terms of the GNU Free Documentation License, Version 1.1 or
any later version published by the Free Software Foundation, with no
Invariant Sections, with no Front-Cover Texts, and with no Back-Cover
Texts. A copy of this license can be found at 
<ulink
 url="http://www.fsf.org/copyleft/fdl.html">http://www.fsf.org/copyleft/fdl.html</ulink>.
  </para>
 </legalnotice>

</bookinfo>

<preface id="aboutthisguide">
<title>About this Guide</title>

<sect1 id="purpose">
<title>Purpose and scope</title>
<para>
This Guide consists of several parts: Part&nbsp;1 provides a
<emphasis>top-down overview</emphasis> of real-time and embedded
operating systems, up to a more detailed description of the features
and implementation of a &ldquo;typical&rdquo; &rtos;, i.e., &rtai;;
Part&nbsp;2 gives more details about implementation of real-time
functionality.
Part&nbsp;3 introduces some time-proven design solutions to common
problems in real-time programming, as well as a list of design and
programming hints, to help readers gain time and reliability in their
designs and implementations. 
</para>

<para>
The top-down view on real-time and embedded operating systems
is complementary to the typical <emphasis>bottom-up</emphasis>
&ldquo;show-me-the-code&rdquo; and &ldquo;bazaar&rdquo; approach of
development and documentation writing in the free software world. Not
that there is something wrong with this approach, but this Guide's
goal is different: it wants to make it easier for newcomers to grasp
the basic concepts and techniques behind real-time operating systems,
and to help them see the forest for the trees, without having to go
and read the code in a ton of different files. Nevertheless: the
source code of the presented projects remains the
<emphasis>only</emphasis> complete and up-to-date documentation.
</para>

<para>
The document tries to be as independent as possible of any
particular implementation or project: so, the concepts introduced in
the theoretical part of the text are not necessarily available in each
and every concrete example given in the text.  Moreover, this Guide is
not meant to be an exhaustive textbook on real-time programming, so
the reader is urged to go and read some operating system textbooks,
and other &ldquo;low-level&rdquo; books such as the
<ulink
 url="http://www.xml.com/ldd/chapter/book/index.html">Linux Device
Drivers
</ulink>
book, for missing details. The reader should be familiar with the
basics of operating systems, and be able to read &ccc; code.
</para>

<para>
This guide first explains the general principles behind real-time and
embedded operating systems. These principles are not too different
from general operating systems such as &linux; or &nt;, but the
emphasis lies on <emphasis>maximum determinism</emphasis>, and not on
<emphasis>maximum average throughput</emphasis>.
Because determinism is often compromised in &ldquo;high-level&rdquo;
programming language and operating system constructs, real-time
designers are confronted more directly than &ldquo;normal&rdquo;
application developers with concepts, timing, memory, and efficiency
at the level of the operating system.
</para>

<para>
Another primary goal of this Guide is educational: it could over time
evolve into classroom notes, featuring demos, annotated code, more
graphical illustrations, and more examples of good design and code. 
Whether it will reach these educational goals depends on
<emphasis>your</emphasis> contributions, as critical reader of the
text, looking out for opportunities to help improve the free
documentation available on your favourite free software
project&hellip;
</para>

</sect1>


<sect1 id="feedback">
<title>Feedback</title>
<para>
This Guide still has a number of paragraphs marked with
<emphasis>&ldquo;TODO&rdquo;</emphasis>, signalling parts that are
still to be filled in with more details, or where the current
treatment needs more thought and/or information.
</para>
<para>
Please direct all questions, additions or suggestions for changes to
<email>Herman.Bruyninckx@mech.kuleuven.ac.be</email>.
</para>

</sect1>


<sect1 id="copyright">
<title>Copyrights, Licenses and Trademarks</title>

<para>
Permission is granted to copy, distribute and/or modify this document under
the terms of the <emphasis>GNU Free Documentation License</emphasis>
(<acronym>FDL</acronym>), Version 1.1 or any later version published by
the Free Software Foundation. A copy of this license can be found  
<ulink url="http://www.fsf.org/copyleft/fdl.html">here</ulink>.
</para>
<para>
&linux; is a trademark of Linus Torvalds. 
&rtlinux; is a trademark of VJY Associates LLC of &rtlinux;'s creators 
<ulink url="http://www.linuxdevices.com/articles/AT2238037882.html">Victor
Yodaiken</ulink> and Michael Barabanov; they released &rtlinux; under the
<ulink url="http://www.fsf.org/copyleft/gpl.html">&gpl; license</ulink>.
&rtai; was first released under the
<ulink
url="http://www.fsf.org/copyleft/lesser.html">&lgpl;</ulink>
license by Paolo Mantegazza, but, later, core components got a &gpl; license.
&ecos;
is released under the 
<ulink url="http://www.redhat.com/embedded/technologies/ecos/ecoslicense.html">
Red Hat eCos Public License</ulink>, but also got the &gpl; license
later on.
<application>Real Time Scheduler</application> by
<ulink url="http://www.mvista.com">Montavista Software, Inc.</ulink>
is released under the &gpl; license.
&rtems; by
<ulink url="http://www.rtems.com/">On-line Applications Research</ulink>
(<acronym>OAR</acronym>)
is released under the &gpl; license.
&kurt; from the
<ulink url="http://www.ittc.ukans.edu/kurt/">University of Kansas Center for
Research, Inc.</ulink> is released under the &gpl; license.
&uclinux; from the
<ulink url="http://www.uclinux.org">Embedded Linux/Microcontroller 
Project</ulink> is released under the &gpl; license.
The <ulink url="http://www.opersys.com/LTT/">Linux Trace Toolkit</ulink>
is released under the &gpl; license by Karim
Yaghmour.
David Schleef released <ulink url="http://stm.lbl.gov/comedi/">Comedi</ulink>
under the &gpl; license.
Karim Yaghmour and Philippe Gerum released
<ulink url="http://www.opersys.com/adeos/">Adeos</ulink>
under the &gpl; license.
</para>
</sect1>


<sect1 id="acknowledgments">
<title>Acknowledgements</title>
<para>
Large parts of this document were written with financial support from
the Flemish
<emphasis>Fonds voor Wetenschappelijk Onderzoek (FWO)</emphasis>, and
the <emphasis>Katholieke Universiteit Leuven</emphasis>, Belgium.
The hospitality offered by Prof. Henrik Christensen of
<emphasis>Kungliga Tekniska H&ouml;gskolan (KTH)</emphasis> in
Stockholm, where other parts of this document were written, is
gratefully acknowledged.
</para>
<para>
The style for this Guide was originally copied from the 
<ulink
 url="http://www.linuxdoc.org/LDP/LDP-Author-Guide/">LDP Author Guide
</ulink> written by Mark F. Komarinski and Jorge Godoy. It used the
<application>ldp.dsl</application> <acronym>SGML</acronym> stylesheet
from the <ulink url="http://www.linuxdoc.org">Linux Documentation
Project</ulink>. The current style is <emphasis>DocBook</emphasis>.
</para>

<para>
&linux; and the indispensable GNU libraries and tools are wonderful gifts
from Linus Torvalds, the people at the
<ulink url="http://www.fsf.org">Free Software Foundation</ulink>, and
thousands of others.
While, in general, <emphasis>&gnulinux;</emphasis> is the
appropriate name to denote
the popular free software operating system, this text usually talks
about &linux;, because the <emphasis>kernel</emphasis> is the topic of
this document.  The text also uses the term
<emphasis>free software</emphasis> as a general name for software
released under licences approved by both the
<ulink url="http://www.fsf.org/philosophy/license-list.html">Free
Software Foundation</ulink>
and the
<ulink url="http://www.opensource.org/licenses/">Open Source
Initiative</ulink>.
</para>

<para>
The <ulink url="http://www.rtlinux.com">&rtlinux;</ulink> real-time
extensions to &linux; were originally created by Victor Yodaiken
and Michael Barabanov. Paolo Mantegazza created the
<ulink url="http://www.rtai.org"><application>Real Time Application
Interface</application></ulink> (&rtai;) real-time
extensions. Karim Yaghmour came up with the design of an alternative
approach towards building a real-time nano-kernel underneath &linux;
(or any operating system kernel for that matter); this design was
implemented in the &adeos; project by Philippe Gerum.
This text's discussion on real-time device drivers is
much inspired by David Schleef's design for 
<ulink url="http://stm.lbl.gov/comedi/">&comedi;</ulink>.
</para>

<para>
Klaas Gadeyne and Patrice Kadionik gave valuable feedback on the first
draft. Error feedback was also received from Werner Spielmann, 
Gregory Matus, and Oscar Esteban.
Paolo Mantegazza and Philippe Gerum helped to clarify some
&rtai; internals. Many thanks also go to Peter Soetens of the 
<ulink url="http://www.orocos.org">Orocos project</ulink>,
and the many critical students and summer interns whose questions
stimulated me to look deeper into all those things that I thought I
understood but didn't.
</para>

</sect1>

</preface>

<toc></toc>

<!-- =====================P=A=R=T==I============================= -->

<part id="part1">
<title>Operating system basics</title>

<partintro>
<para> 
This Part introduces the concepts and primitives with which general
purpose as well as real-time and embedded operating systems are built.
The text discusses the applicability and appropriateness of all these
concepts in real-time and embedded operating systems.
</para> 
<para> 
(TODO: more annotated code examples.)
</para> 
</partintro>


<chapter id="realtime">
<title>Real-time and embedded operating systems</title>
<para>
This Chapter discusses the basics of operating systems in general, and
real-time and embedded operating systems in particular.  (This text
uses the abbreviations OS, &rtos; and &eos;,
respectively.) This discussion makes clear why standard &linux;
doesn't qualify as a real-time OS, nor as an embedded OS.
</para>

<para>
Real-time and embedded operating systems are in most respects similar
to general purpose operating systems: they provide the interface
between application programs and the system hardware, and they rely on
basically the same set of programming primitives and concepts. But
general purpose operating systems make different trade-offs in
<emphasis>applying</emphasis> these primitives, because they have
different goals.
</para>

<sect1 id="sect-os">
<title>OS responsibilities</title>
<para>
This Section discusses the <anchor id="rtos-respon">basic
responsibilities of the operating system that are relevant for this
text: (i) task management and scheduling, (ii) (deferred) interrupt
servicing, (iii) inter-process communication and synchronization, and
(iv) memory management. General-purpose operating systems also have
other responsibilities, which are beyond the horizon of a
<emphasis>real-time</emphasis> operating system: file systems and file
management, (graphical) user interaction, communication protocol
stacks, disk IO, to name a few.
More details about the relevant responsibilities are given in the
following Chapters.
</para>


<sect2 id="rtos-task-management">
<title>Task management and scheduling</title>
<para>
<emphasis>Task (or &ldquo;process&rdquo;, or &ldquo;thread&rdquo;)
management</emphasis> is a primary job of the operating system: tasks
must be created and deleted while the system is running; tasks can
change their priority levels, their timing constraints, their memory
needs; etcetera. Task management for an &rtos; is a bit more
dangerous than for a general purpose OS: if a real-time task is
created, it <emphasis>has</emphasis> to get the memory it needs
without delay, and that memory <emphasis>has</emphasis> to be locked
in main memory in order to avoid access latencies due to swapping;
changing run-time priorities influences the run-time behaviour of the
whole system and hence also the predictability which is so important
for an &rtos;.
So, dynamic process management is a potential headache for an &rtos;.
<xref linkend="task-management"> gives more details.
</para>

<para>
In general, multiple tasks will be active at the same time, and the OS
is responsible for sharing the available resources (&cpu; time, memory,
etc.) over the tasks. The &cpu; is one of the important resources, and
deciding how to share the &cpu; over the tasks is called
&ldquo;scheduling&rdquo;. 
</para>

<para>
The general trade-off made in scheduling algorithms is between, on the
one hand, the <emphasis>simplicity</emphasis> (and hence efficiency)
of the algorithm, and, on the other hand, its
<emphasis>optimality</emphasis>. (Note that various optimality
criteria exist!) Algorithms that want to be globally optimal are
usually quite complex, and/or require knowledge about a large number
of task parameters, that are often not straightforward to find on line
(e.g., the duration of the next run of a specific task; the time
instants when sleeping tasks will become ready to run; etc.).
Real-time and embedded operating systems favour simple scheduling
algorithms, because these take a small and deterministic amount of
computing time, and require little memory footprint for their code.
</para>

<para>
General purpose and real-time operating systems differ considerably in
their scheduling algorithms. They use the same basic principles, but
apply them differently because they have to satisfy different
performance criteria. A general purpose OS aims at maximum
<emphasis>average</emphasis> throughput, a real-time OS aims at
<emphasis>deterministic</emphasis> behaviour, and an embedded OS wants
to keep memory footprint and power consumption low.  A large variety
of &ldquo;real-time&rdquo; scheduling algorithms exists, but some are
standard in most real-time operating systems
(see <xref linkend="prior-sched">):
<emphasis>static priority scheduling</emphasis>,
<emphasis>earliest deadline first (EDF)</emphasis>, and
<emphasis>rate-monotonic scheduling</emphasis>.
</para>

</sect2>


<sect2 id="rtos-interrupt-servicing">
<title>Interrupt servicing</title>
<para>
An operating system must not only be able to schedule tasks according
to a deterministic algorithm, but it also has to service
peripheral hardware, such as timers, motors, sensors, communication
devices, disks, etc. All of those can request the attention of the OS
<emphasis>asynchronously</emphasis>, i.e., at the time that
<emphasis>they</emphasis> want to use the OS services, the OS has
to make sure it is ready to service the requests. Such a
request for attention is often signaled by means of an
<emphasis>interrupt</emphasis>. There are two kinds of interrupts: 
<itemizedlist>
 <listitem>
 <para>
<emphasis>Hardware interrupt.<indexterm>
<primary>hardware interrupt</primary></indexterm><indexterm>
<primary>interrupt</primary><secondary>hardware</secondary>
</indexterm></emphasis>
The peripheral device can put a bit on a particular hardware channel
that triggers the processor(s) on which the OS runs, to signal that
the device needs servicing. The result of this trigger is that the
processor saves its current state, and jumps to an address in its
memory space, that has been connected to the hardware interrupt at
initialisation time.
 </para>
 </listitem>

 <listitem>
 <para>
<emphasis>Software interrupt.<indexterm>
<primary>software interrupt</primary></indexterm><indexterm>
<primary>interrupt</primary><secondary>software</secondary></indexterm>
</emphasis>
Many processors have built-in software instructions with which the
effect of a hardware interrupt can be generated in software.
The result of a software interrupt is
also a triggering of the processor, so that it jumps to a
pre-specified address.
 </para>
 </listitem>
</itemizedlist>
</para>

<para>
The operating system is, in principle, not involved in the execution
of the code triggered by the hardware interrupt: this is taken care of
by the &cpu; without software interference. The OS, however, does
have influence on (i) connecting a memory address to every interrupt
line, and (ii) what has to be done <emphasis>immediately
after</emphasis> the interrupt has been serviced, i.e., how
<emphasis>&ldquo;deferred interrupt servicing&rdquo;</emphasis> is to
be handled. 
Obviously, real-time operating systems have a specific approach
towards working with interrupts, because they are a primary means to
guarantee that tasks get serviced deterministically.
<xref linkend="interrupts"> gives more details.
</para>

</sect2>


<sect2 id="rtos-ipc">
<title>Communication and synchronization</title>
<para>
A third responsibility of an OS is commonly known under the name of
<emphasis>Inter-Process Communication</emphasis> (&ipc;).
(&ldquo;Process&rdquo; is, in this context, just another name for
&ldquo;task&rdquo;.)
The general name &ipc; collects a large set of programming primitives
that the operating system makes available to tasks that need to
exchange information with other tasks, or synchronize
their actions. Again, an &rtos; has to make sure that this communication
and synchronization take place in a deterministic way. 
<xref linkend="ipc-synch"> gives more details.
</para>

<para>
Besides communication and synchronization with other tasks that run on
the same computer, some tasks also need to talk to other computers, or
to peripheral hardware (such as analog input or output cards).  This
involves some peripheral hardware, such as a serial line or a network,
and special purpose device drivers (<xref linkend="devicedriver">).
</para>

</sect2>


<sect2 id="rtos-mem-management">
<title>Memory management</title>
<para>
A fourth responsibility of the OS is <emphasis>memory
management</emphasis>: the different tasks in the system all require part
of the available memory, often to be placed on specified hardware addresses
(for memory-mapped IO). The job of the OS then is (i) to give each task
the memory it needs (<emphasis>memory allocation</emphasis>), 
(ii) to map the real memory onto the address ranges used in the
different tasks (<emphasis>memory mapping</emphasis>),
and
(iii) to take the appropriate action when a task uses memory
that it has not allocated. (Common causes are: unconnected pointers
and array indexing beyond the bounds of the array.) This is the so-called 
<emphasis>memory<anchor id="memprot"> protection</emphasis> feature of the
OS. Of course, what exactly the &ldquo;appropriate action&rdquo; should be
depends on the application; often it boils down to the simplest solution:
killing the task and notifying the user.
<xref linkend="memory-management"> gives more details.
</para>
</sect2>
</sect1>


<sect1 id="arch">
<title>Trade-offs</title>

<para>
This Section discusses some of the trade-offs that (both general
purpose, and real-time and embedded) operating system designers
commonly make.
<itemizedlist>

 <listitem>
 <para>
<emphasis>Kernel space versus user space versus real-time space</emphasis>.
 </para>

 <para>
Most modern processors allow programs to run in two different
hardware protection levels. &linux; calls these two levels
<emphasis>kernel space</emphasis> and <emphasis>user space</emphasis>.
Tasks in the latter have more protection against erroneous accesses to
physical memory of I/O devices, but access most of the hardware with larger
latencies than kernel space tasks. The real-time &linux; variants add
a third layer, the <emphasis>real-time space</emphasis>. This is in
fact nothing else but a part of kernel space, but used in a
particular way.
 </para>
 </listitem>
 <listitem>
 <para>
<emphasis>Monolithic kernel versus micro-kernel</emphasis>.
 </para>

 <para>
A monolithic kernel<indexterm>
<primary>monolithic kernel</primary></indexterm>
<indexterm>
<primary>kernel</primary><secondary>monolithic</secondary></indexterm>
has all OS services (including device drivers, network stacks, file
systems, etc.) running within the <emphasis>privileged mode<indexterm>
<primary>privileged mode</primary></indexterm></emphasis> of the
processor.
(This doesn't mean that the whole kernel is one single &ccc; file!)
A micro-kernel,<indexterm>
<primary>micro-kernel</primary></indexterm>
<indexterm> 
<primary>kernel</primary><secondary>micro</secondary></indexterm>
on the other hand, uses the privileged mode only for really core
services (task management and scheduling, interprocess communication,
interrupt handling, and memory management), and has most of the
device drivers and OS services running as &ldquo;normal&rdquo; tasks.
The trade-off between both is as follows: a monolithic kernel is
easier to make more efficient (because OS services can run completely
without switches from privileged to non-privileged mode), but a
micro-kernel is more difficult to crash (an error in a device driver
that doesn't run in privileged mode is less likely to cause a system
halt than an error occurring in privileged mode).
 </para>

 <para>
&unix;, &linux; and &nt; have monolithic kernels;
&qnx;,<indexterm><primary>&qnx;</primary></indexterm>
&fiasco;,
&vxworks;,<indexterm><primary>&vxworks;</primary></indexterm>
and
&gnuhurd; have micro-kernels. &linux;, as well as some commercial
&unix; systems, allow the number of services in the kernel to be changed
dynamically or statically: extra functionality is added by loading a
<emphasis>module</emphasis>. But the loaded functionality becomes part
of the monolithic kernel. A minimal &linux; kernel (which includes
memory management, task switching and timer services) is some
hundreds of kilobytes big; this approaches the footprint for embedded
systems. However, more and more embedded systems have footprints of
more than a megabyte, because they also require network
stacks and various communication functionalities.
 </para>
 </listitem>

 <listitem>
 <para>
<emphasis>Pre-emptable kernel or not</emphasis>.
 </para>

 <para>
&linux; was originally a non-pre-emptable kernel: a kernel space task
cannot be interrupted by other kernel space tasks, or by user space
tasks. The kernel is &ldquo;locked&rdquo; as long as one kernel
function is executing. This usage of locks 
(<xref linkend="sect-locks">) makes the design of the
kernel simpler, but introduces indeterministic latencies which are not
tolerable in an &rtos;. 
 </para>

 <para>
In the 2.5 kernel series, &linux; gets a more and more fine-grained
kernel locking mechanism, <emphasis>and</emphasis> has become to a
large extent pre-emptable. (See <xref linkend="linux-preempt">.)
&linux; still has one &ldquo;Big Kernel Lock<indexterm> <primary>Big
Kernel Lock</primary></indexterm> (BKL<indexterm>
<primary>BKL</primary></indexterm>),&rdquo; called
<parameter>kernel_flag</parameter> in the &linux; source code, but now
independent subsystems (networking, disk IO, etc.) get their own sets
of locks.
 </para>

 </listitem>

 <listitem>
 <para>
<emphasis>Scalability</emphasis>.
 </para>
 <para>
Finer-grained locking is good for <emphasis>scalability<indexterm>
<primary>scalability</primary></indexterm></emphasis>, but usually an
overhead for single-CPU systems. <emphasis>Solaris<indexterm>
<primary>Solaris</primary></indexterm></emphasis> is an example of
a very fine-grained and scalable operating system, which performs
worse on &ldquo;low-end&rdquo; PCs. The
<ulink url="http://sourceforge.net/projects/lse/">Linux Scalability
Effort<indexterm><primary>Linux Scalability Effort</primary>
</indexterm></ulink> project has more information about the ongoing
activities in this area, as far as the &linux; kernel is concerned.
 </para>
 <para>
Scalability is <emphasis>much less</emphasis> of an issue in
<emphasis>real-time</emphasis> applications, because the goals are so
different: the desire behind scalable systems is to divide a large work
load transparently over a number of available &cpu;s, while the desire
behind real-time systems is to have everything controlled in a strictly
deterministic way. 
 </para>
 </listitem>

 <listitem>
 <para>
<emphasis>Memory management versus shared memory</emphasis>.
 </para>

 <para>
Virtual memory and dynamic allocation and de-allocation of
memory pages are amongst the most commonly used memory management
services of a general purpose operating system. However, this memory
management induces overhead, <emphasis>and</emphasis> some simpler
processors have no support for this memory management. On these
processors (which power an enormous number of embedded systems!), all
tasks share the same memory space, such that developers must take care
of the proper use of that memory. Also some real-time kernels (such as
&rtlinux;) have all their tasks share the same address space (even if
the processor supports memory management), because this allows more
efficient code.
 </para>
 </listitem>

 <listitem>
 <para>
<emphasis>Dedicated versus general</emphasis>.
 </para>
 <para>
For many applications, it is worthwhile not to use a commercially or
freely available
operating system, but write one that is optimised for the task
at hand. Examples are the operating systems for mobile phones, or
Personal Digital Assistants. Standard operating systems would be too
big, and they don't have the specific signal processing support
(speech and handwriting recognition) that is typical for these
applications. Some applications don't even need an
operating system at all. (For example, a simple vending machine.) The
trade-offs here are: cost of development and decreased portability,
against cost of smaller and cheaper embedded systems.
 </para>
 </listitem>

 <listitem>
 <para>
<emphasis>Operating system versus language runtime</emphasis>.
 </para>

 <para>
Application programs make use of &ldquo;lower-level&rdquo; primitives
to build their functionality. This functionality can be offered by the
operating system (via system calls), or by a programming language (via
language primitives and libraries). Languages such as &cpp;,
&ada; and &java; offer lots of
functionality this way: memory management, threading, task
synchronization, exception handling, etc. This functionality is
collected in a so-called <emphasis>runtime</emphasis>. The advantages
of using a runtime are: its interface is portable over different
operating systems, and it offers ready-to-use and/or safe solutions to
common problems. The disadvantages are that a runtime is in general
&ldquo;heavy&rdquo;, not deterministic in execution time, and not very
configurable. These disadvantages are important in real-time and
embedded contexts.
 </para>
 </listitem>
</itemizedlist>
</para>

</sect1>


<sect1 id="sect-time">
<title>Time</title>
<para>
Not surprisingly, &ldquo;time&rdquo; plays an important role in the
design and use of a real-time operating system. This Section
introduces some relevant terminology and definitions.
</para>

<sect2 id="def-rt">
<title>Real time</title>
<para>
Probably you'll find as many interpretations of the meaning of
<emphasis>real time</emphasis> as you find publications on this topic. One
simple definition is:
<blockquote>
 <para>
A real-time operating system is able to execute all of its tasks
without violating <emphasis>specified</emphasis> timing constraints.
 </para>
</blockquote>
Another definition is:
<blockquote>
 <para>
Times at which tasks will execute can be <emphasis>predicted
deterministically</emphasis> on the basis of knowledge about the
system's hardware and software.
 </para>
</blockquote>
</para>

<para>
That means, if the hardware <emphasis>can</emphasis> do the job, the
RTOS software <emphasis>will</emphasis> do the job deterministically.
(This determinism must be softened a bit, because of the 
&ldquo;stochastic&rdquo; nature
of the inevitable scheduling &ldquo;jitter&rdquo;, see
<xref linkend="latency">.)
</para>

<para>
One often makes a distinction between &ldquo;soft real time&rdquo; and
&ldquo;hard real time&rdquo;. &ldquo;Soft&rdquo; indicates that not
meeting the specified timing constraints is not a disaster, while it
<emphasis>is</emphasis> a disaster for a hard real-time system. For
example: playing an audio or video file is soft real time, because few
people will notice when a sample comes a fraction of a second too
late.  Steering a space probe, on the other hand, requires hard real
time, because the rocket moves with a velocity of several kilometers
per second such that small delays in the steering signals add up to
significant disturbances in the orbit which can cause erroneous
atmosphere entry situations. Precision mills and high-accuracy
radiation or surgical robots are other examples that require hard
real-time: moving the mill or the robot one tenth of a millimeter too
far due to timing errors can cause the rejection of produced parts, or
the death of patients.
</para>
<para>
Practically speaking, the distinction between soft and hard real time is
often (implicitly and mistakenly) related to the time scales involved in
the system: in this reasoning, soft real-time tasks must typically be
scheduled with (coarser than)
<emphasis>milli-seconds</emphasis> accuracy, and hard real-time tasks with
<emphasis>micro-seconds</emphasis> accuracy. But this implicit assumption
has many exceptions! For example, a one-dollar 4 bit processor
controlling a traffic light can be more hard real time (in the sense
of &ldquo;deterministic&rdquo;) than a 5000 dollar
<hardware>Athlon</hardware>-based e-commerce server.
</para>
</sect2>


<sect2 id="latency">
<title>Latency</title>
<para>
The <emphasis>latency</emphasis> (or <emphasis>tardiness</emphasis>) of a
task is the difference between the instant of time on which the task
should have started (or finished) and the instant of time on which it
actually did. (Or, in different contexts, the time between the
<emphasis>generation</emphasis> of an event, and
its <emphasis>perception</emphasis>.) Latencies are due to several
factors: (i) the timing properties of processor, bus, memory (on-chip
cache, off-chip RAM and ROM) and peripheral devices, (ii) the
scheduling properties of the OS, (iii) the
<emphasis>pre-emptiveness</emphasis> of its kernel, (iv) the
load on the system (i.e., the number of tasks that want to be scheduled
concurrently), and (v) the <emphasis>context switch</emphasis> time.
This latter is the time the processor needs to save the data of the
currently running task (e.g., registers, stack, and instruction
pointer), and to replace it with the local data of the newly scheduled
task. Few of these factors are constant over time, and the
statistical distribution of the latencies in the subsequent
schedulings of tasks is called the <emphasis>jitter</emphasis>.
</para>

<para>
This is a far from exhaustive list of kernel activities that introduce
<emphasis>indeterminism</emphasis> into the timing behaviour of a
(general purpose) operating system:
<itemizedlist>
 <listitem>
 <para>
<emphasis>Accessing the hard disk.</emphasis> Because the alignment of
sectors, and the distance between the tracks needed by a given task
are variable, that task cannot be sure about how long it will take to
access the data it needs. In addition, hard disks are mechanical
devices, whose time scales are much longer than purely electronic
devices (such as RAM memory); and accesses to the hard disk are
<emphasis>buffered</emphasis> in order to reduce
<emphasis>average</emphasis> disk access time.
 </para>
 </listitem>
 <listitem>
 <para>
<emphasis>Accessing a network.</emphasis>
Especially with the <acronym>TCP/IP</acronym>
protocol, that re-sends packets in case of transmission errors.
 </para>
 </listitem>
 <listitem>
 <para>
<emphasis>Low-resolution timing.</emphasis>
See <xref linkend="timers">.
 </para>
 </listitem>

 <listitem>
 <para>
Another delay related to time keeping is the fact that
<emphasis>programming the timer chip</emphasis> often generates
unpredictable delays. This delay is of the order of microseconds, so
only important for really high-accuracy timing.
 </para>
 </listitem>

 <listitem>
 <para>
<emphasis>Non-real-time device drivers.</emphasis>
Device drivers are often sloppy about their
time budget: they use busy waiting or roughly estimated sleeping
periods, instead of timer interrupts, or lock resources longer than
strictly necessary, or run in user space with the corresponding timing
unpredictability.
 </para>
 </listitem>
 <listitem>
 <para>
<emphasis>Memory allocation and management.</emphasis>
After a task has asked for more memory (e.g., through a
<function>malloc</function> function call), the time that the memory
allocation task needs to fulfill the request is unpredictable. Especially
when the allocated memory has become strongly fragmented and no contiguous
block of memory can be allocated.
Moreover, a general purpose operating system swaps code and data out
of the physical memory when the total memory requirements of all tasks are
larger than the available physical memory.
 </para>
 </listitem>
 <listitem>
 <para>
<emphasis><function>proc</function> file system.</emphasis> This is the
very rich (non-graphical) user interface to what is going on
inside the &linux; kernel: all this information is offered to user tasks in
the form of &ldquo;files&rdquo; in this (virtual!) file system. However,
accessing information in this file system implies significant overhead in
some cases, because the files are virtual: they are &ldquo;created&rdquo;
only when needed.
 </para>
 </listitem>
</itemizedlist>
</para>

<para>
The exact <emphasis>magnitude</emphasis> of all the above-mentioned
time delays changes very strongly between different hardware. Hence,
it is not just the operating system software that makes the
difference. For some applications, the context switch time is most
important (e.g., for sampling audio signals at 44kHz), while other
applications require high computational performance, at lower
scheduling frequencies (e.g., robot motion control at 1kHz). But
again, some tasks, such as speech processing, require both.
</para>
</sect2>

<sect2 id="rtos-time-constraints">
<title>Timing constraints</title>
<para>
Different applications have different timing constraints, 
which, ideally, the &rtos; should be able to satisfy. However, there
still don't exist general and guaranteed scheduler algorithms
(<xref linkend="task-management">) that are able to satisfy all the
following classes of time constraints:
<itemizedlist>
 <listitem>
  <para>
  <emphasis>Deadline</emphasis>: a task has to be completed
before a given instant in time, but when exactly the task is performed
during the time interval between now and the deadline is not important for
the quality of the final result. For example: the processor must fill the
buffer of a sound card before that buffer empties; the voltage on an output
port must reach a given level before another peripheral device comes and
reads that value.
  </para>
 </listitem>
 <listitem>
  <para>
  <emphasis>Zero execution time</emphasis>: the task must be
performed in a time period that is zero in the ideal case. For example:
digital control theory assumes that taking a measurement, calculating the
control action, and sending it out to a peripheral device all take place
instantaneously.
  </para>
 </listitem>
 <listitem>
  <para>
  <emphasis><anchor id="sect-qos">Quality of
Service</emphasis> (&qos;): the task must get a fixed amount of
&ldquo;service&rdquo; per time unit.  (&ldquo;Service&rdquo; often
means &ldquo;&cpu; time&rdquo;, but could also be &ldquo;memory
pages&rdquo;, &ldquo;network bandwidth&rdquo; or &ldquo;disk access
bandwidth&rdquo;.) This is important for applications such as
multimedia (in order to read or write streaming audio or video data to
the multimedia devices), or network servers (both in order to
guarantee a minimum service and in order to avoid &ldquo;denial of
service&rdquo; attacks).
  </para>
  <para>
The &qos; is often specified by means of a
small number of parameters: &ldquo;s&rdquo; seconds of service in each
time frame of &ldquo;t&rdquo; seconds. A specification of 5
micro-seconds per 20 micro-seconds is a much more real-time &qos; than
a specification of 5 seconds per 20 seconds, although, on the average,
both result in the same amount of time allotted to the task.
  </para>
 </listitem>
</itemizedlist>
The major problem is that the scheduler needs complete knowledge about
how long each task is going to take in the near future, and when it
will become ready to run. This information is practically impossible
to get, and even when it is available, calculation of the optimal
scheduling plan is a search problem with high complexity, and hence
high cost in time.
</para>

<para>
Different tasks compete for the same resources: processors, network,
memory, disks, &hellip; Much more than in the general purpose OS case,
programmers of real-time systems have to take into account
<emphasis>worst-case</emphasis> scenarios:  if various tasks
<emphasis>could</emphasis> be needing a service, then sooner or later
they <emphasis>will</emphasis> want it at the same time. 
</para>

</sect2>

<sect2 id="timers">
<title>Time data structures</title>
<para>
The smallest time slice used in most general
purpose operating systems is longer than 1 millisecond.
Not because the processors are not fast enough to do significant
amounts of work in that time slice, but because 32 bit machines have
only 2^32 time slices before their timing counter runs over. At 1000
ticks per second, this corresponds to less than 50 days, which is
certainly insufficient for servers and embedded systems.
&linux; uses a scheduling time slice (&ldquo;jiffie&rdquo;) of 10
milliseconds on most processors. (1 millisecond on
<hardware>Alpha</hardware>, which has 64 bit counters.)
 </para>
 <para>
The timing constraints of real-time tasks are often expressed with
much higher resolutions than those of the general purpose scheduler,
i.e., (less than) microseconds instead of milliseconds.  Hence, the
data structure
in which the time is kept should be adapted to this higher rate, in
order to avoid overflow. For example, the real-time &linux; variants
(<xref linkend="rt-linux-variants">) use a
<emphasis>high-resolution time data structure</emphasis> that counts
time in <emphasis>nanoseconds</emphasis>.
A 64 bit integer should do the job in that case, but 32
bits could be too dangerous. (A 32 bit counter overflows after about
4 seconds when counting at a 1 nanosecond rate!) Note that not all
compilers can deal with 64 bit integers, such that some assembly
coding may be required in some cases.
 </para>
 <para>
&posix; has standardized &ldquo;clocks<indexterm>
<primary>clock</primary></indexterm>&rdquo;
<indexterm>
 <primary>&posix;</primary><secondary>clock</secondary>
</indexterm>
and &ldquo;timers<indexterm> 
<primary>timer</primary></indexterm>&rdquo;.
<indexterm>
 <primary>&posix;</primary><secondary>timer</secondary>
</indexterm>
The <parameter>timespec</parameter> is a data structure that keeps the
time in two separate seconds and nanoseconds sub-structures
(<filename>include/linux/time.h</filename> of the &linux; source tree):
<programlisting>
<![CDATA[
typedef long             __kernel_time_t; // include/asm/posix_types.h
typedef __kernel_time_t  time_t;

struct timespec {
   time_t  tv_sec;         /* seconds,  */
   long    tv_nsec;        /* nanoseconds */
};
]]>
</programlisting>
The <parameter>timespec</parameter> data structure uses 64 bits, but
the separation between seconds and
nanoseconds is an inefficient way of representing time: there
are only 10^9 (approximately 2^30) nanoseconds in one second. So,
a little more than two bits of the nanoseconds field are not
used. This means that, at each and every addition of a time increment,
the software has to check whether the boundary of 1 second has been
reached, in which case the seconds field has to be updated. This is more
complicated than just having a 64 bit counter that can keep on counting
without having to check.
</para>

</sect2>

</sect1>


<sect1 id="eos">
<title>Embedded OS</title>
<para>
<indexterm>
<primary>embedded OS</primary></indexterm>
<indexterm>
<primary>operating system</primary><secondary>embedded</secondary></indexterm>
The concepts introduced in the previous sections apply of course
also to embedded operating systems (&ldquo;&eos;&rdquo;). Embedded
operating systems, however, have some features that distinguish them from
real-time and general purpose operating systems.
But the definition of an &ldquo;embedded operating system&rdquo; is
probably even more ambiguous than that of an &rtos;, and they come in
a zillion different forms. But you'll recognize one when
you see one, although the boundary between general purpose operating
systems and embedded operating systems is not sharp,
and is even becoming more blurred all the time. 
</para>

<para>
Embedded systems are being installed in tremendous quantities (an order of
magnitude more than desktop PCs!): they control lots of functions in modern
cars; they show up in household appliances and toys; they control vital
medical instrumentation; they make remote controls and
<acronym>GPS</acronym> (Global Positioning System) work; they make your
portable phones work; etc.
</para>

<para>
The simplest classification between different kinds of embedded
operating systems is as follows:
<itemizedlist>
 <listitem>
 <para>
<emphasis>High-end embedded systems.</emphasis>
These systems are often down-sized derivatives of an existing general
purpose OS, but with much of the &ldquo;ballast&rdquo; removed.
&linux; has given rise to a large set of such derivatives, because of
its highly modular structure and the availability of source code.
Examples are: routers, switches, personal digital assistants, set top
boxes.
 </para>
 </listitem>
 <listitem>
 <para>
<emphasis>Deeply embedded OS.</emphasis> These OSs must be really
<emphasis>very</emphasis> small, and need only a handful of basic
functions. Therefore, they are mostly designed from the ground up for
a particular application. Two typical functions deeply embedded
systems (used to) lack are high-performance graphical user interfacing
and network communication.  Examples are: automotive controls, digital
cameras, portable phones. But also these systems get more graphics and
networking capabilities&hellip;
 </para>
 </listitem>
</itemizedlist>
</para>
<para>
The most
important features that make an OS into an embedded OS are:
<itemizedlist>

<listitem>
<para>
Small <emphasis>footprint</emphasis>.
Designers are continuously trying to put more computing power in
smaller housings, using cheaper &cpu;s, with on-board digital and/or
analog IO; and they want to integrate these &cpu;s in all kinds of
small objects.  A small embedded OS also often uses only a couple of
kilobytes of RAM and ROM memory. 
</para>
</listitem>

<listitem>
<para>
The embedded system should run for years without manual intervention.  This
means that the hardware <emphasis>and</emphasis> the software should never
fail.  Hence, the system should preferably have no mechanical parts, such
as floppy drives or hard disks.  Not only because mechanical parts are more
sensitive to failures, but they also take up more space, need more energy,
take longer to communicate with, and have more complex drivers
(e.g., due to motion control of the mechanical parts). 
</para>
</listitem>

<listitem>
<para>
Many embedded systems have to control devices that can be dangerous if
they don't work exactly as designed. Therefore, the status of these
devices has to be checked regularly. The embedded computer system
itself, however, is one of these critical devices, and has to be
checked too! Hence, one often sees
<emphasis>hardware watchdogs</emphasis><indexterm>
<primary>hardware watchdog</primary></indexterm>
<indexterm>
 <primary>watchdog</primary><secondary>hardware</secondary>
</indexterm>
included in embedded systems.
These watchdogs are usually retriggerable monostable timers attached
to the processor's reset input. The operating system checks within
specified intervals whether everything is working as desired, for
example by examining the contents of status registers. It then resets
the watchdog.  So, if the OS doesn't succeed in resetting the timer,
that means that the system is not functioning properly and the timer
goes off, forcing the processor to reset. 
</para>
<para>
If something went wrong but the OS is still working (e.g., a memory
protection error in one of the tasks) the OS can activate a
<emphasis>software watchdog</emphasis>,<indexterm>
<primary>software watchdog</primary></indexterm>
<indexterm>
 <primary>watchdog</primary><secondary>software</secondary>
</indexterm>
which is nothing else but an interrupt that schedules a service
routine to handle the error. One important job of the software
watchdog could be to generate a
<emphasis>core dump</emphasis>,<indexterm>
<primary>core dump</primary></indexterm>
to be used for analysis of what situations led to the crash.
</para>
</listitem>

<listitem>
<para>
A long autonomy also implies using as little power as possible:
embedded systems often have to live a long time on batteries (e.g.,
mobile phones), or are part of a larger system with very limited power
resources (e.g., satellites).
</para>
</listitem>

<listitem>
<para>
If the system does fail despite its designed robustness (e.g., caused
by a memory protection fault, <xref linkend="rtos-mem-management">),
there is usually no user around to take the appropriate actions.
Hence, the system itself should reboot autonomously, in a
&ldquo;safe&rdquo; state, and &ldquo;instantly&rdquo; if it is
supposed to control other critical devices. Compare this to the
booting of your desktop computer, which needs a minute or more before
it can be used, and always comes up in the same default state&hellip;
</para>
</listitem>

<listitem>
<para>
It should be as cheap as possible. Embedded systems are often produced
in quantities of several thousands or even millions. Decreasing the
unit price even a little bit boils down to enormous savings.
</para>
</listitem>

<listitem>
<para>
Some embedded systems are not physically reachable anymore after they
have been started (e.g., launched satellites) in order to add software
updates. However, more and more of them can still be accessed
remotely. Therefore, they should support
<emphasis>dynamic linking</emphasis>: object code that did not exist
at the time of start is uploaded to the system, and linked in the
running OS without stopping it.
</para>
</listitem>

</itemizedlist>
Some applications require all features of embedded
<emphasis>and</emphasis> real-time operating systems. The best known
examples are mobile phones and (speech-operated) handheld computers
(&ldquo;PDA&rdquo;s): they must be small, consume little power, and
yet be able to execute advanced signal processing algorithms, while
taking up as little space as possible.
</para>
<para>
The above-mentioned arguments led embedded OS developers to design
systems with the absolute minimum of software and hardware. Roughly
speaking, developers of general purpose and real-time operating
systems approach their clients with a
&ldquo;<emphasis>Hey, look how much we can do!</emphasis>&rdquo;
marketing strategy; while &eos; developers say
&ldquo;<emphasis>Hey, look how little we need to do what you
want!</emphasis>&rdquo;.
Hence, embedded systems often come without a memory management
unit<indexterm><primary>memory management unit</primary></indexterm>
(<acronym>MMU</acronym>),<indexterm><primary>MMU</primary></indexterm>
multi-tasking, a networking &ldquo;stack&rdquo;, or file systems. The
extreme is one single monolithic program on the bare processor, thus
completely eliminating the need for any operating system at all.
</para>
<para>
Taking out more and more features of a general purpose operating
system makes its footprint smaller and its predictability higher. On
the other hand, adding more features to an &eos; makes it look like a
general purpose OS. Most current &rtos; and &eos; operating systems
are expanding their ranges of application, and cover more of the
full &ldquo;feature spectrum.&rdquo;
</para>

</sect1>


<sect1 id="os-standards">
<title>Operating system standards</title>
<para>
<indexterm>
 <primary>standard</primary><secondary>operating system</secondary>
</indexterm>
Real-time and embedded systems are not a user product in themselves,
but serve as platforms on which to build applications. As for any
other software platform, the availability of standards facilitates the
job of programmers enormously, because it makes it easier, cheaper and
faster to develop new applications, and to port an existing application
to new hardware. In the world of real-time
and embedded systems, standardization is not a burning issue, because
many projects in this area have unique requirements, need unique
extensions to already existing products, don't need frequent updates
by different people, and are seldom visible to end-users.
All these &ldquo;features&rdquo; do not really help in forcing developers
to use standards&hellip; (They do like standard
<emphasis>tools</emphasis> though, which is one reason for the
popularity of the Free Software &gnu; tools.)
</para>

<para>
This Section lists some standardization efforts that exist in
the real-time and embedded world.
</para>


<sect2 id="standards-posix">
<title>POSIX</title>
<para>
&posix;<indexterm><primary>&posix;</primary></indexterm>
(&ldquo;Portable Operating System Interface&rdquo;, a name
that Richard Stallman<indexterm>
<primary>Stallman, Richard</primary></indexterm>
came up with) is a standard for the function
calls (the <emphasis>Application Programming Interface</emphasis>,
&api;) of &unix;-like general purpose operating systems. &posix; has
some specifications on real-time primitives too. Its definition of
real time is quite loose:
<blockquote>
<para>
The ability of the operating system to provide a required level of
service in a bounded response time.
</para>
</blockquote>
The standard is managed by the 
<ulink
 url="http://www.pasc.org/">Portable Application Standards Committee</ulink>
(<acronym>PASC</acronym>) of the
<ulink
 url="http://www.ieee.org">Institute of Electrical and Electronics Engineers</ulink>
(<acronym>IEEE</acronym>), and is not freely available.
There is an extensive <emphasis>Rationale</emphasis> document, that
explains the reasons behind the choices that the &posix; committees
made, as well as lots of other interesting remarks. That document can
be found
<ulink
 url="http://www.opengroup.org/onlinepubs/007904975/xrat/contents.html">here
</ulink>.
</para>
<para>
The &posix; components relevant to real-time are: 1003.1b (real-time),
1003.1d (additional real-time extensions), 1003.1j (advanced real-time
extensions). See
<ulink
 url="http://www.opengroup.org/onlinepubs/007904975/idx/realtime.html">this
link</ulink> or
<ulink
 url="http://www.unix-systems.org/version3/ieee_std.html">here (IEEE Std
1003.1-2001)
</ulink>
for more details. These standards are
often also denoted as <acronym>ANSI/IEEE Std. 1003.1b</acronym>, etcetera.
</para>
<para>
&posix; also defines four so-called
<emphasis>profiles<indexterm>
<primary>&posix; profiles</primary></indexterm></emphasis>
<indexterm>
 <primary>profile</primary><secondary>&posix;</secondary>
</indexterm>
for real-time systems:
<itemizedlist>

<listitem>
<para>
<emphasis>PSE51 (Minimal Realtime System Profile)</emphasis>.
This profile offers the basic set of functionality for a single
process, deeply embedded system, such as for the
unattended control of special I/O devices. Neither user
interaction nor a file system (mass storage) is required. The system
runs one single &posix; process, that can run multiple &posix;
threads. These threads can use &posix; message passing. The process
itself can use this message passing to communicate with other
PSE5X-conformant systems (e.g., multiple &cpu;s on a common backplane,
each running an independent PSE51 system).
The hardware model for this profile assumes a single processor with
its memory, but no memory management unit (MMU) or common I/O devices
(serial line, ethernet card, etc.) are required.
</para>
</listitem>

<listitem>
<para>
<emphasis>PSE52 (Realtime Controller System Profile)</emphasis>.
This profile is the PSE51 profile, plus support for a file
system (possibly implemented as a RAM disk!)
and <emphasis>asynchronous</emphasis> I/O.
</para>
</listitem>

<listitem>
<para>
<emphasis>PSE53 (Dedicated Realtime System Profile)</emphasis>.
This profile is the PSE51 profile, plus support for multiple
processes, but minus the file system support of the PSE52 profile. The
hardware can have a memory management unit.
</para>
</listitem>

<listitem>
<para>
<emphasis>PSE54 (Multi-Purpose Realtime System Profile)</emphasis>.
This is the superset of the other profiles and essentially consists of
the entire POSIX.1, POSIX.1b, POSIX.1c and/or POSIX.5b standards.
Not all processes or threads must be real-time. 
Interactive user processes are allowed on a PSE54 system, so all of
POSIX.2 and POSIX.2a are also included. 
The hardware model for this profile assumes one or more processors
with memory management units, high-speed storage devices, special
interfaces, network support, and display devices. 
</para>
</listitem>

</itemizedlist>
&rtlinux; claims to comply with the <emphasis>PSE51</emphasis> profile;
&rtai; claims nothing.
</para>

<para>
&linux;'s goal is &posix; compliance, but not blindly, and not at all
costs. The <filename>/usr/include/unistd.h</filename> header file
gives information about which parts of the standard have been
implemented already. For example: the implementation of threads
(see <xref linkend="task-management">), and the scheduler modes
(see <xref linkend="sched-linux">). Many of the real-time &posix;
extensions have already been implemented in &rtlinux; and &rtai;
(see <xref linkend="rt-linux-variants">). 
</para>

</sect2>

<sect2 id="standards-unix98">
<title>Unix98</title>
<para>
<emphasis>&unix;</emphasis> (<acronym>UNIX98<indexterm>
<primary>UNIX98</primary></indexterm></acronym>,
<emphasis>Single UNIX Specification, Version 2</emphasis>)
is the standardization of &unix; operating systems driven by the
<ulink url="http://www.unix-systems.org/unix98.html">Open Group</ulink>.
It incorporates a lot of the &posix; standards.
</para>

</sect2>

<sect2 id="standards-elix">
<title>EL/IX</title>
<para>
<emphasis>&elix;.<indexterm>
<primary>&elix;</primary></indexterm></emphasis>
The <ulink url="http://sources.redhat.com/elix/">&elix;</ulink> &api; for
embedded systems wants to be a standards-compliant subset of &posix;
and <acronym>ANSI C</acronym>.
</para>

</sect2>


<sect2 id="standards-uitron">
<title>&uitron;</title>
<para>
<emphasis>&uitron;.<indexterm>
<primary>&uitron;</primary></indexterm></emphasis>
<ulink url="http://www.itron.gr.jp/home-e.html">&uitron;</ulink> is a
Japanese standard for embedded systems.
&ldquo;<acronym>TRON</acronym>&rdquo; stands for
<emphasis>The Real-time Operating system Nucleus</emphasis>; the
letter &ldquo;<acronym>I</acronym>&rdquo; stands for
<emphasis>industrial</emphasis>, and the
&ldquo;mu&rdquo; for <emphasis>micro</emphasis>.
(There are other <acronym>TRON</acronym>s too:
<acronym>BTRON</acronym> for business, <acronym>CTRON</acronym>
for communication, &hellip;)
</para>

</sect2>


<sect2 id="standards-osek">
<title>OSEK</title>
<para>
<emphasis>&osek;<indexterm>
<primary>&osek;</primary></indexterm>.</emphasis>
<ulink url="http://www.osek-vdx.org/">&osek;</ulink> is a German
standard for an open architecture for distributed vehicle control
units. The architecture is open, but no free software implementation
is available.
</para>

</sect2>


<sect2 id="standards-rtsj">
<title>Real-Time Specification for Java</title>
<para>
The <emphasis>Real-Time Specification for Java<indexterm>
 <primary>Real-Time Specification</primary>
 <secondary>for &java;</secondary></indexterm>
(RTSJ)</emphasis>
(<ulink url="http://jcp.org/jsr/detail/1.jsp">Java Community
Process</ulink>, <ulink url="http://www.rtj.org/">rtj.org</ulink>)
is not really an operating system, but a <emphasis>runtime<indexterm>
<primary>runtime</primary></indexterm></emphasis> for a programming
language. The distinction is not really fundamental for normal
desktop use; it can be enormous for real-time use, because a runtime
must make use of the services of the underlying operating system. That
means that a runtime with real-time features is useless on a non
real-time operating system.
</para>
<para>
This specification was
released in 2001, and, similar to the &posix; specifications, it is
<emphasis>not</emphasis> an implementation; some commercial
implementations are already available.
The basic prescriptions of the specification are:
<itemizedlist>

<listitem>
<para>
Implementations of the specification are allowed to introduce their
own optimizations and extensions, such as, for example,
scheduling algorithms or garbage collection.
</para>
</listitem>

<listitem>
<para>
The <emphasis>minimum</emphasis> task management includes static
priority-based preemptive scheduling, with at least 28 priority
levels.
</para>
</listitem>

<listitem>
<para>
Priority inversion &ldquo;prevention&rdquo;
(<xref linkend="sect-prior-inherit">) is mandatory.
</para>
</listitem>

<listitem>
<para>
An implementation must include classes that provide an
asynchronous event mechanism.
</para>
</listitem>

<listitem>
<para>
Exceptions must be allowed to change the context to another thread.
</para>
</listitem>

<listitem>
<para>
Classes must be provided to allow direct access to physical memory.
</para>
</listitem>

</itemizedlist>
</para>

</sect2>

<sect2 id="standards-ada95">
<title>Ada 95</title>
<para>
<emphasis>Ada 95 real-time specifications.<indexterm>
 <primary>Real-Time Specification</primary>
 <secondary>for &ada; 95</secondary></indexterm> </emphasis>
</para>
<para>
The <ulink url="http://marte.unican.es/">MaRTE OS</ulink> is an
example of a free software real-time kernel for embedded applications
that complies with Minimal Real-Time POSIX.13. Most of its code is
written in Ada with some C and assembler parts. The &ada; runtime
from the 
<ulink url="ftp://ftp.cs.nyu.edu/pub/gnat">GNU Ada Toolkit (GNAT)</ulink>
has been adapted to run on the kernel. The &ada; compiler comes under
the &gpl;, but the runtime has a modified &gpl; license that allows
it to be used without constraints in commercial systems.
</para>
<para>
<ulink url="http://polaris.dit.upm.es/~ork/">OpenRavenscar</ulink>
is another free software real-time kernel in &ada;.
</para>

</sect2>

<sect2 id="standards-rtcorba">
<title>Real-Time CORBA</title>
<para>
The <emphasis>Object Management Group (OMG)</emphasis>
<indexterm><primary>Object Management Group</primary></indexterm>
<indexterm><primary>OMG</primary></indexterm>
has released a specification of a &ldquo;real-time&rdquo; component
broker interface, called <emphasis>Real-Time &corba;</emphasis>
(&ldquo;&rtcorba;&rdquo;).
This is <emphasis>not</emphasis> a piece of software, but a
<emphasis>specification</emphasis> interface. So, various
implementations can satisfy the interface, with very different
real-time behaviour. The &rtcorba; specifications allow the component
builder to specify some <emphasis>desired</emphasis> properties that
are common for real-time tasks, such as static priority levels or
time-outs. These specifications have to be mapped onto real (RT)OS
primitives by the specific implementation(s) used in the application.
</para>

</sect2>

</sect1>


<sect1 id="linux-rtos-eos">
<title>Linux for real-time and embedded</title>
<para>
Linux is a <emphasis>general purpose</emphasis> operating system, with
a non-pre-emptable kernel: it wants to give all tasks a
<emphasis>fair</emphasis> share of the resources (processor, memory,
peripheral devices, &hellip;), and it doesn't interrupt kernel
activities.  &linux;'s basic user space
scheduler is of the <emphasis>time slicing</emphasis> type: it gives
more or less equal time slices to different tasks.  It is possible to
change the priorities of user space tasks to some extent (using the
<function>nice</function> command), but not enough to make the
scheduling deterministic.  Other reasons why &linux; is a poor &rtos;
are the unpredictable delays caused by non-pre-emptable operations
running in kernel space, and by the mere size of that kernel. Indeed,
<emphasis>nobody</emphasis> can understand the kernel sufficiently
well to be able to predict how long a certain operation is going to
take.
</para>

<para>
All remarks above hold for all general purpose operating
systems, such as <productname>Windows</productname>,
<productname>AIX</productname>, <productname>IRIX</productname>,
<productname>HP-UX</productname>, <productname>Solaris</productname>,
etc.  It may sound strange at first, but &ldquo;good old&rdquo;
<productname>DOS</productname> was much closer to being an &rtos; than
Linux, because its scheduler was much less &ldquo;fair&rdquo; and
advanced, and it had fewer system services to look after. (However,
DOS is only an advantage if there is only <emphasis>one</emphasis>
real-time task!) Because none of the desktop or server operating
systems is a good candidate for real-time and/or embedded
applications, several companies have started to develop special
purpose operating systems, often for quite small markets.  Many of
them are UNIX-like, but they are not mutually compatible. The market
is very fragmented, with several dozens of &rtos;s, none of which
holds a majority of the market. At least, this was the case
<emphasis>before</emphasis> &linux; appeared on the radar of real-time
and embedded system companies. Since about the year 2000, the market
has seen lots of mergers and acquisitions, and substantial efforts
from the established &rtos; companies to become as
&ldquo;&linux;-compliant&rdquo; as possible.
</para>

<para>
The fact that Microsoft tries to enter the market too (with its
<application>PocketPC/Windows CE</application> product line) is only
accelerating this evolution.  History has taught us that the fragmented
<application>UNIX</application> desktop and server markets were easy
targets for Microsoft&hellip;, even with inferior technology. So,
hopefully the competitors have learned from this experience.
</para>

<para>
While the &linux; kernel people, headed by Linus Torvalds, are very
keen on making the general support and performance of &linux; better,
their interest in real time is very small, to say the least&hellip; No
efforts to make &linux; into a real &rtos; have to be expected from
that side, but the kernel <emphasis>is</emphasis> evolving towards
higher pre-emptability (first of all, because this is necessary if one
wants to scale &linux; to more than, say, two CPUs).
</para>

<para>
Torvalds has mentioned two reasons why he doesn't want to make &linux; into
a real-time operating system:
<itemizedlist>
 <listitem>
 <para>
Computers are getting faster all the time, such that a general-purpose
operating system will satisfy the requirements of more and more
&ldquo;real-time&rdquo; users. (That is, those that require a
<emphasis>fast</emphasis> system, which is not the same as a
<emphasis>deterministic</emphasis> system.)
 </para>
 </listitem>
 <listitem>
 <para>
Offering hard real-time features in a general-purpose OS will quickly
result in
<ulink url="http://kernelnotes.org/lnxlists/linux-kernel/lk_0006_05/msg00180.html">&ldquo;bad
behaviour&rdquo; of application programmers</ulink>: they will all
want their application to perform best, and program it with high
priority. Experience has shown many times that this leads to
incompatible timing constraints between different applications sooner
rather than later.
 </para>
 </listitem>
</itemizedlist>
However, there are no technical reasons why &linux; would not be able
to become (more of) an &rtos;, and much technology to make &linux; more
powerful on the high-end server systems is also useful for real-time
and embedded purposes: real multi-threading in the kernel, finer locks
and scheduling points needed for SMP systems, migration of processes
over &cpu;s, &ldquo;hot-swappable&rdquo; devices, etc.
</para>

<para>
Anyway, quite a lot of Free Software efforts have started to contribute
software in the area of real-time and embedded systems. These
contributions can be classified as follows:
<itemizedlist>
 <listitem>
 <para>
<emphasis>Eliminating functionalities from the standard Linux
kernel.</emphasis>
 </para>
 <para>
This approach aims at reducing the memory footprint of the operating
system, and is hence mainly focused on embedded systems.
<ulink url="http://www.uclinux.org">&uclinux;</ulink> is an example.
Other projects develop small and simple &ccc;
libraries, because the current versions of the <acronym>GNU</acronym>
tools have become quite large; for example,
<ulink url="http://www.busybox.net">BusyBox</ulink> (a replacement
for most of the utilities one usually finds in the &gnu; fileutils,
shellutils, etc.);
<ulink url="http://www.uclibc.org">&mu;clibc</ulink> (a small version
of the general &ccc; library).
 </para>
 </listitem>
 <listitem>
 <para>
<emphasis>Patches to the standard Linux kernel.</emphasis>
 </para>
 <para>
This approach replaces the standard scheduler of &linux; with a more
deterministic scheduling algorithm, and adds scheduling points to the
&linux; source tree, in order to make the kernel more responsive.
 </para>
 </listitem>
 <listitem>
 <para>
<emphasis>Real-time patches underneath the Linux kernel.</emphasis>
 </para>
 <para>
This approach runs &linux; as a low-priority process in a small
real-time kernel. This kernel takes over the real hardware from
&linux;, and replaces it with a software simulation.
 </para>
 <para>
The two major examples that follow this road are
<ulink url="http://www.rtlinux.org/">&rtlinux;</ulink>
(<xref linkend="sect-rtlinux">)
and
<ulink url="http://www.rtai.org/">&rtai;</ulink>
(<xref linkend="sect-rtai">).
 </para>
 </listitem>
 <listitem>
 <para>
<emphasis>Linux-independent operating systems.</emphasis>
 </para>
 <para>
These projects have been developed completely independently from
&linux;, and some of them are even older.
Some examples are
<ulink url="http://www.rtems.com/">&rtems;</ulink>, and
<ulink url="http://sources.redhat.com/ecos/">&ecos;</ulink>.
 </para>
 </listitem>
</itemizedlist>
</para>


<para>
The &linux; kernel that supports a typical desktop computer is
several hundreds of kilobytes large. And that does
<emphasis>not</emphasis> include the memory taken up by the &linux;
tools and the users' applications. Hence, the &linux; footprint is too
large for many embedded systems.  It also takes about a minute or so
to boot a PC-like computer, which is much too long for most embedded
systems. And it expects a hard disk to work with, and a power supply
of more than 100 Watts for modern high-end &cpu;s, video cards and
hard disks.
</para>

<para>
However, one of the nicest things about &linux; is its enormous
configurability, of which you can get a taste if you compile your own
&linux; kernel. That kernel can be constructed out of lots of more or
less independent modules, and you just leave out the modules that are
not needed in your system. If your application requires no ethernet
card, leave out the network drivers; if you don't need a screen, why
bother with installing the X Window System; etc. This means that many
people have configured &linux;-derived systems that become so small
that they fit on a single floppy. 
</para>

<para>
The previous paragraphs may suggest that &linux; proper has no chance
at all at being used as an embedded OS. However, it has some
advantages that may turn out to be decisive in the not too distant
future (certainly because memory and &cpu;s become cheaper): its
configurability, its ability to be administered from a distance,
<emphasis>and</emphasis> its many ways to add security features.
</para>

</sect1>

</chapter>


<chapter id="task-management">
<title>Task management and scheduling</title>
<para>
This Chapter explains what <emphasis>task management</emphasis> means,
and how it can influence the real-time behaviour of an operating system.
Concrete examples come from the &posix; standard, but the concepts are
identical for other task management &api;s.
<emphasis>Scheduling</emphasis> of tasks is one of the
responsibilities of the task management with influence on the
real-time behaviour of the system. Other responsibilities are: task
creation and deletion, linking tasks to interrupts and deferred
interrupt servicing, and assignment and change of scheduling
priorities.
</para>

<sect1 id="process-thread">
<title>Processes and threads</title>
<para>
We use &ldquo;task&rdquo; as the generic name for both
<emphasis>processes</emphasis> and <emphasis>threads</emphasis>.
A process is the normal &ldquo;unit of execution&rdquo; in &unix;
systems: if you compile a C program that has one single
<function>main()</function>, then running this program requires one
process. (That process can itself generate other processes too, of
course.) The operating system must provide several services to each
process: memory pages (in virtual memory and in physical
<acronym>RAM</acronym>) for code, data, stack and heap, and for file
and other descriptors; registers in the &cpu;; queues for scheduling;
signals and &ipc;; etc.
</para>

<para> A process can spawn new processes (&ldquo;children&rdquo;),
either by starting up an independent process via a system
call, or by <function>fork</function>-ing itself. (The &linux; kernel
uses a somewhat different approach, with the <function>clone()</function>
function, see <xref linkend="linux-tasks">.)
The forked process is a copy of the parent process, but it gets its
own memory, registers, file descriptors, and process identifier.
Starting a new process is a relatively heavy task for the operating
system, because memory has to be allocated, and lots of data
structures and code segments must be copied.
</para>

<para>
A thread is a &ldquo;lightweight&rdquo; process, in the sense that
different threads share the same address space. That is, they share
global and &ldquo;<function>static</function>&rdquo; variables, file
descriptors, signal bookkeeping, code area, and heap, but they have
their own thread status, program counter, registers, signal mask
(in &linux; but not in &unix;), and stack. The interesting fact from
an &rtos; point of view is that threads have shorter creation and
<emphasis>context switch<indexterm>
<primary>context switch</primary></indexterm></emphasis> times, and
faster &ipc; (see <xref linkend="ipc-synch">). A &ldquo;context
switch&rdquo; is the saving of the state of the currently running task
(registers, stack pointer, instruction pointer, etc.), and the
restoring of the state of the new task. Other advantages for using
multiple threads within a process are:
<itemizedlist>

<listitem>
<para>
The threads can be run on separate processors.
</para>
</listitem>

<listitem>
<para>
The tasks can be prioritized, so that a less important computation
can, in response to an external event, be suspended to process that
event.
</para>
</listitem>

<listitem>
<para>
Computation can occur in one thread, while waiting for an event, such
as the completion of I/O, can be outsourced to another thread.
</para>
</listitem>

</itemizedlist>
On the other hand, using threads requires functions to be made
&ldquo;thread-safe&rdquo;: when a function <function>func()</function>
is called in one thread, this thread can be pre-empted by another
thread, which, in turn, can call the same function; hence, this
function should not keep intermediate data in variables that are
shared between the different threads.
</para>

<para>
Many modern &cpu;s offer functionality such as floating-point
calculation, digital signal processing (e.g., &ldquo;MMX&rdquo;), or
on-chip memory caches. These functions require extra registers and/or
operations, so, when this extra functionality can be avoided,
real-time determinism is increased (because the context switch time is
lower if fewer registers have to be saved and restored). For example,
&linux; doesn't save the floating point registers for kernel tasks and
interrupt service routines.
</para>

</sect1>


<sect1 id="sect-posix-threads">
<title>&posix; thread management</title>

<para>
The &posix;<indexterm>
<primary>&posix;</primary></indexterm>
 operating system standard
(see <xref linkend="os-standards">)
has an extensive threads &api;, which all &unix;-like
operating systems implement (albeit to varying degrees). The thread
implementation in &linux; is not the most complete.
The real-time operating systems discussed in later Chapters all have
decent, but not complete, &posix; thread functionality. The reasons
why many operating systems don't implement the full &posix; standards
are: (i) &posix; is not a single, rigid standard, but a large set of
complementary standards with different focus points; (ii) one doesn't
need the whole API to build functional and efficient software systems;
(iii) some parts of the standard require complicated implementations
with meager practical and not-unique advantages; (iv) some features
made it into the standard for the sole purpose of being backwards
compatible with older existing &unix; systems.
</para>

<para>
The &posix; &api; provides the following function calls (and others!)
for thread<indexterm>
<primary>thread</primary><secondary>&posix;</secondary></indexterm>
creation and deletion:
<programlisting>
<![CDATA[
int pthread_create(
  pthread_t *thread,               // thread data structure
  pthread_attr_t *attr,            // attributes data structure
  void *(*start_routine) (void *), // function to execute
  void *arg                        // argument to pass to function
);
void pthread_exit(void *retval);
int pthread_join(pthread_t thread, void **status);
int pthread_detach(pthread_t thread);
int pthread_cancel(pthread_t thread);
]]>
</programlisting>
(The initial letter &ldquo;<function>p</function>&rdquo; indicates the
&posix; heritage.)
Thread creation involves some overhead, because memory has to be
allocated for the new thread; in a real-time setting, the memory also
has to be <emphasis>locked</emphasis> into RAM, in order to be sure
that no time will ever be lost because the memory pages have to be
swapped in from disk when needed. Similarly, freeing memory at thread
deletion is also an overhead. So, a real-time application should do
thread creation and deletion <emphasis>outside</emphasis> of the
real-time activity.
</para>

<para>
Other overhead caused by task management is: satisfying requested
changes in the timing or priority properties of tasks, and the
maintenance of the task queues at all priority levels when tasks are
woken up, put asleep, made running, obliged to wait for a blocking
&ipc; call, etc.
</para>

<para>
In the <function>pthread_create()</function>, the programmer can
specify the <emphasis>run-time priority</emphasis> of the task, as
well as the scheduling policy to use, through the
<function>pthread_setschedparam()</function> function call.
</para>
<para>
<function>pthread_join()</function>,
<function>pthread_detach()</function>, and
<function>pthread_cancel()</function>
are different ways to end
the execution of a task. A task should indeed not be deleted
blindly, because it shares a lot of its components with other
tasks, so its memory space and locks should not be released when its
cousin tasks are still using them. Especially
<emphasis>cancelling</emphasis> a thread from within another thread is
dangerous: it is practically impossible to tell what resources the
cancelled task is holding (including locks!). So, &posix; prescribes
a procedure to cancel tasks. 
<function>pthread_cancel()</function> does not cancel the task
immediately, but is only a <emphasis>request</emphasis> to the
operating system to cancel the task. How the task is cancelled
depends on how the task initialised its own cancellation policy,
via:
<itemizedlist>

<listitem>
<para>
<function>int pthread_setcancelstate(int state, int *oldstate)</function>:
atomically sets the calling task's cancellability
<emphasis>state</emphasis> to the indicated
<parameter>state</parameter> and returns the previous cancellability
state in <parameter>oldstate</parameter>. Possible values for 
<parameter>state</parameter> are
<parameter>PTHREAD_CANCEL_ENABLE</parameter> and
<parameter>PTHREAD_CANCEL_DISABLE</parameter>.
</para>
</listitem>

<listitem>
<para>
<function>int pthread_setcanceltype(int type, int *oldtype)</function>:
atomically sets the calling task's cancellability
<emphasis>type</emphasis> to the indicated <parameter>type</parameter>
and returns the previous cancellability type in
<parameter>oldtype</parameter>.  Possible values for
<parameter>type</parameter> are
<parameter>PTHREAD_CANCEL_DEFERRED</parameter> and
<parameter>PTHREAD_CANCEL_ASYNCHRONOUS</parameter>. 
</para>
</listitem>

</itemizedlist>
The default cancellation type and state are 
<parameter>PTHREAD_CANCEL_DEFERRED</parameter> and
<parameter>PTHREAD_CANCEL_ENABLE</parameter>.
</para>
<para>
Cancellation happens immediately if the task has chosen the
<parameter>PTHREAD_CANCEL_ASYNCHRONOUS</parameter> policy; so, this
policy should only be chosen when the programmer is certain that the
task can be killed at any time, without compromising the rest of the
system. If the task has chosen the 
<parameter>PTHREAD_CANCEL_DEFERRED</parameter> policy,
it is cancelled only when it reaches a so-called
<emphasis>cancellation point<indexterm>
<primary>cancellation point</primary></indexterm></emphasis>.
These OS-dependent points are function calls where the task
tests whether it has received a cancellation request. (Or rather, the
operating system does the test for it, as well as the cancellation
handling, discussed below.) Cancellation points are typically
calls that might block for a long time, such that the OS need only
check for pending cancellation requests when the operation is about to
block indefinitely. This includes, but is not at all limited to,
<function>pthread_cond_wait()</function>,
<function>pthread_cond_timedwait()</function>,
<function>sem_wait()</function>, and
<function>sigwait()</function>.
</para>
<para>
The task that one wants to cancel can postpone cancellation in order
to perform application-specific cleanup processing.
It does this by &ldquo;pushing&rdquo; cancellation
<emphasis>cleanup handlers<indexterm>
<primary>cleanup handler</primary></indexterm></emphasis>
<indexterm>
 <primary>cancellation</primary><secondary>cleanup handler</secondary>
</indexterm>
every time that it acquires some resource. As the task leaves the
last cancellation point before releasing a resource, it needs to
&ldquo;pop&rdquo; the cleanup handler it had pushed earlier for this
resource. Pushing and popping is done by the
<function>pthread_cleanup_push()</function>
and <function>pthread_cleanup_pop()</function> function calls.
Every cleanup handler that is still on the cleanup stack is
invoked (in <emphasis>Last-in, First-Out</emphasis> order) when the
task is cancelled, and its job is to cleanly release the resource.
The task terminates when the last cleanup handler
returns. The task exit status returned by
<function>pthread_join()</function> on a cancelled task is
<parameter>PTHREAD_CANCELED</parameter>.
</para>
<para>
(This behaviour is quite standard in many software tasks;
<xref linkend="sect-events"> gives the generic software design
behind such behaviour.)
</para>
<para>
The cancellation procedures above might seem a bit involved, but
that's due to the complexity of the problem one wants to solve: making
sure that another task exits without blocking other tasks.  Anyway,
this kind of cancellation should be avoided whenever possible.  The
clean solution is to let all tasks in your application react to a
<emphasis>condition variable<indexterm>
<primary>condition variable</primary></indexterm></emphasis>
that indicates that it must shut down itself
(<xref linkend="event-loop-exit">).
</para>

<para>
An &rtos; must also allow the programmer to specify the <emphasis>timing</emphasis>
with which threads have to run. One typically uses two timing modes:
<itemizedlist>
 <listitem>
 <para>
<emphasis>Periodic:</emphasis><indexterm>
<primary>periodic timing</primary></indexterm>
<indexterm>
 <primary>timing</primary><secondary>periodic</secondary>
</indexterm>
 the task must run at regular intervals.
 </para>
 </listitem>
 <listitem>
 <para>
<emphasis>One-shot:</emphasis><indexterm>
<primary>one-shot timing</primary></indexterm>
<indexterm>
 <primary>timing</primary><secondary>one-shot</secondary>
</indexterm>
the task must run only once, at a predefined instant in time.
 </para>
 </listitem>
</itemizedlist>
One-shot timing sometimes requires a bit more overhead, because of a
more involved hardware timer programming.  &posix; has no standardized
function calls for <emphasis>periodic</emphasis> timing. The reasons
are that: (i) there are multiple ways in which the desired
functionality can be programmed with already existing &posix;
primitives; and (ii) most applications have to break the periodic loop
in one way or another anyhow, depending on application-specific conditions.
Because of the lack of a (&posix;) standard API for periodic thread
timing, different operating systems implemented the functions on their
own, such that application programs will most probably have
portability
problems in this area. For example, &rtlinux; uses 
<function>pthread_make_periodic_np()</function> for both options (the
<function>_np</function> suffix stands for &ldquo;non-portable&rdquo;),
while &rtai; has <function>rt_set_periodic_mode()</function>
and <function>rt_set_oneshot_mode()</function>.
</para>

<para>
As examples of alternatives for the periodic timing function, &posix;
provides the <function>usleep()</function> and
<function>nanosleep()</function> function calls. These put tasks
asleep with a high timing resolution (microseconds and nanoseconds,
respectively). The achievable resolution depends of course on the type
of &cpu;.
</para>

<para>
Some other often-used functionality that &posix; has not standardized
is: to allow the use of floating point operations in a thread (for
which, e.g., &rtlinux; has introduced
<function>pthread_setfp_np()</function>); to suspend execution of
<emphasis>another</emphasis> thread than the one that executes the
function
(&ldquo;<function>pthread_suspend_np(another_thread)</function>&rdquo;);
and
&ldquo;<function>pthread_wakeup_np(another_thread)</function>&rdquo;
to resume execution of the other thread. Note again the 
&ldquo;<function>&hellip;_np</function>&rdquo; suffix.
</para>

<para>
The floating point selection option was considered too low level and
hardware dependent to put into the &posix; standard. Saving a couple
of registers more or less is more of a matter of
<emphasis>optimization</emphasis>, and such things don't belong in a
standard. The &linux; scheduler, for example, always saves floating
point registers of <emphasis>user space processes</emphasis> by
default.
</para>

<para>
The <function>pthread_suspend_np()</function> and
<function>pthread_wakeup_np()</function> functions are
<emphasis>dangerous</emphasis> (see below), and the &posix; committee
had very good reasons not to include them in the standard. However,
many users think they are &ldquo;user-friendly&rdquo;, because they
sometimes save them a lot of keystrokes. The danger of
<function>pthread_suspend_np()</function> is that, while its use is
convenient to stop a thread, it leaves that thread most probably in an
undefined state, such that it's hard to predict what the thread is
going to do when <function>pthread_wakeup_np()</function> starts it
again!
</para>

<para>
The proper way of suspending the execution of a thread is to let
the thread do it <emphasis>itself</emphasis>, at a moment it is ready
to do so, i.e., it is in a well-defined state, from which it can
restart in a deterministic way.  <xref linkend="hints"> gives some
more detailed examples.
</para>

</sect1>


<sect1 id="linux-tasks">
<title>Linux tasks and tasklets</title>
<para>
The above-mentioned distinction between &ldquo;process&rdquo; and
&ldquo;thread&rdquo; is not what Linus Torvalds has in mind. He
thinks the really important concept is the
<emphasis>Context of execution</emphasis>:<indexterm>
<primary>context of execution</primary></indexterm>
that includes things like &cpu; state (registers, etc.), memory
management state (page mappings), permission state (user ID, group
ID), code to execute, and various &ldquo;communication states&rdquo;
(open files, signal handlers, etc.). An email by Torvalds in which he
explains his (and hence &linux;'s) point of view can be found
<ulink
 url="http://www.uwsg.iu.edu/hypermail/linux/kernel/9608/0191.html">here
</ulink>.
&posix; threads are offered on &linux; as a
<emphasis>library</emphasis>, and basically only because of compliance
with the standard. Anyway, they are just one single possible way
to share context. And the &linux; kernel offers a more flexible
alternative: the <function>clone()</function> creates a new
&ldquo;task&rdquo;, with a large choice in what parts of the
<emphasis>context of execution</emphasis> one wants to share between
the new task and the task that creates it. See the corresponding man
page for more details.
</para>
<para>
Many operating systems provide another primitive besides threads
or processes, that programmers can use to execute functionality.
&linux; and &rtai; call it <emphasis>tasklets<indexterm>
<primary>tasklet</primary></indexterm></emphasis>,
<xref linkend="prio-spaces">. A tasklet is a
<emphasis>function</emphasis> whose execution can be asked for by any
kernel task, and that the operating system will execute
<emphasis>before</emphasis> it does its next scheduling. At that
moment, the OS executes these functions one by one. So, the important
features of tasklets are:
<itemizedlist>

<listitem>
<para>
They are a more &ldquo;lightweight&rdquo; primitive than tasks,
to execute functions outside of, and prior to, the normal scheduling
of tasks.
</para>
</listitem>

<listitem>
<para>
They are not pre-empted by normal tasks.
</para>
</listitem>

</itemizedlist>
But tasklets <emphasis>can</emphasis> be pre-empted by interrupts,
because the kernel has enabled all hardware interrupts when it runs
the tasklets. Tasklets are typically only executed once, but some
operating systems (e.g., &rtai;) offer periodic execution of tasklets,
by registering them with a
<emphasis>timer<indexterm>
<primary>timer</primary></indexterm></emphasis>.
The tasklet primitive is also very useful as a so-called
<emphasis>Deferred Service Routine<indexterm>
<primary>Deferred Service Routine</primary></indexterm>
<indexterm>
 <primary>interrupt</primary>
 <secondary>Deferred Service Routine</secondary>
</indexterm><indexterm>
<primary>DSR</primary></indexterm>(DSR)</emphasis>,
<xref linkend="sect-idsr">.
</para>
</sect1>


<sect1 id="sched-prob">
<title>Scheduling</title>

<para>
Some texts make a distinction between <emphasis>scheduling</emphasis>
and <emphasis>dispatching</emphasis>, with dispatching being the
simpler of the two operations:
<itemizedlist>
 <listitem>
 <para>
<emphasis>Scheduling</emphasis>: determining the order and the timing
(i.e., the &ldquo;schedule&rdquo;) with which tasks should be run.
 </para>
 </listitem>

 <listitem>
 <para>
<emphasis>Dispatching</emphasis>: the dispatcher starts and stops
the tasks, i.e., it <emphasis>implements</emphasis> the schedule.
 </para>
 </listitem>
</itemizedlist>
This text only uses the term &ldquo;scheduling&rdquo;.
</para>

<para>
A primary responsibility of an &rtos; is to make sure that all tasks
meet their <emphasis>timing</emphasis> constraints. Timing constraints
come in
<link linkend="rtos-time-constraints">different flavours</link>
(deadline, zero execution time, &qos;), and for every task the
constraints can change over time. For example, a motion generator for
a mobile robot has many more constraints to take into account when it
navigates in an environment with many nearby obstacles, while its job
is much easier in open areas. Or, users of a multimedia server have
different &qos; requirements for editing one video stream than for the
editing and synchronization of several streams.
</para>

<para>
So, a &ldquo;one-size-fits-all&rdquo; scheduling algorithm does not
exist, although that is exactly what a general purpose operating
system hopes to offer. Hence, it should come as no surprise that there
is a vast literature on the theory of scheduling, accompanied by a
large variety of (un)implemented scheduling algorithms. A
theoretically optimal schedule can only be reached in the unlikely
situation of <emphasis>complete knowledge</emphasis> about the
processing, synchronization and communication requirements of each
task, and the processing and timing properties of the hardware. This
state of complete knowledge is seldom reached in real-world
applications, especially when the requirements are
<emphasis>dynamic</emphasis> (i.e., time varying).  And even with
complete predictability, the general scheduling problem is
<emphasis>NP-complete</emphasis>, which means that its complexity
increases exponentially with the number of tasks and constraints
involved in the scheduling. And hence, the scheduling algorithms don't
scale well under a growing load and/or hardware resources. This does
<emphasis>not</emphasis> imply, however, that the problem is
infeasible for applications with only a few, well-defined tasks.
</para>

<para>
Each OS has a scheduler <emphasis>function<indexterm>
<primary>scheduler function</primary></indexterm>
</emphasis> (let's call it &schedf;), that
implements the scheduling algorithm. (Later 
<link linkend="sched-linux">sections</link> discuss the most
common scheduling algorithms.) 
This scheduler is <emphasis>not</emphasis> a task in itself: it is a
function call, that is called at various points in the kernel. These
points are, not surprisingly, called
<emphasis>scheduling points.<indexterm>
<primary>scheduling point</primary></indexterm></emphasis>
Typical scheduling points are: end of interrupt service routines
(<xref linkend="sect-inter-sw">), the moments when tasks want to go
to sleep for one reason or another, or when they become ready to run.
</para>
<para>
Scheduling is pure overhead: all time spent on calculating which task
to run next is lost for the really productive tasks. And trying to use
more optimal schedulers isn't always a clever &ldquo;solution&rdquo;:
advanced schedulers consume (often unpredictably!) more time and
resources, and their increased complexity makes it more difficult for
programmers to work with. Hence, the chance that those programmers
make the wrong design decisions increases.  Simplicity is especially
a key feature for real-time and embedded systems; complex schedulers
appear more in Operations Research applications, where the scheduling
problem and its algorithmic complexity are comparable to the operating
system case, but where the real-time constraints and the
predictability of the cost of tasks are more manageable.
</para>
<para>
(TODO: explain (&posix;) cancellation points: why are they needed?
what makes a point a valid cancellation point? Warn against using
cancellation, because it's so error prone. Not only from the OS point
of view (the OS must make sure its thread and lock bookkeeping
remains consistent, which is not a simple job), but also from the
application point of view (how do you make sure that there is no race
between one thread trying to cancel another thread, and a third thread
that still wants to interact with that to-be-cancelled thread?). It's
way better to have each thread exit itself explicitly, and to have an
explicit exit condition for each thread. And to make thread
interaction <emphasis>asynchronous</emphasis>.)
</para>


</sect1>


<sect1 id="prior-sched">
<title>Priority-based scheduling</title>

<para>
<anchor id="stat-prior-sched">
The <emphasis>simplest</emphasis> approach to the scheduling problem
is to assign <emphasis>static priorities<indexterm>
<primary>static priority scheduling</primary></indexterm></emphasis>
<indexterm>
 <primary>scheduling</primary><secondary>static priority</secondary>
</indexterm>
<indexterm>
 <primary>priority</primary>
 <secondary>static priority scheduling</secondary>
</indexterm>
to all tasks. That means that the priority is given to the task at the
time it is created. The scheduler function &schedf; is then very
simple, because it looks at all wait queues at each priority level, and
starts the task with the highest priority that is ready to run.
</para>
<para>
Using priorities implies using <emphasis>pre-emption<indexterm>
<primary>pre-emption</primary></indexterm></emphasis>:
&schedf; interrupts a lower priority task in order to run a 
higher priority task that requests it. Pre-emption means that the
running task's context is switched out, and the new task's context is
switched in.
</para>

<para>
One classifies priorities into <emphasis>statically</emphasis> and
<emphasis>dynamically</emphasis> assigned priorities. In the former
case, a task is given a priority by the programmer at design time (or
by the operator at system initialization time), and it keeps
this priority during its whole lifetime. In the dynamic case, &schedf;
becomes more complex, because it has to calculate the task's priority
on-line, based on a number of dynamically changing parameters (time
till next deadline; amount of work to process; etc.). As described
before, the optimal solution to a scheduling problem is usually
impossible to find, so scheduling is often based on a set of
<emphasis>heuristics</emphasis>. This is the case for real-time as
well as non-real-time schedulers. The heuristics in a general
purpose OS can be quite involved, but real-time and embedded operating
systems mostly use simple heuristics. Because &ldquo;simple&rdquo;
means: faster and smaller and more predictable! Examples of such
simple dynamic scheduling algorithms, that are sometimes used to
replace static priority scheduling, are:
<itemizedlist>
 <listitem>
  <para>
<emphasis>Rate monotonic<indexterm> 
<primary>rate monotonic</primary></indexterm>
<indexterm>
 <primary>RM</primary> <secondary>rate monotonic</secondary>
</indexterm>
<indexterm>
 <primary>scheduling</primary>
 <secondary>rate monotonic</secondary>
</indexterm></emphasis> (<acronym>RM</acronym>). A task gets a
higher priority if it has to run more frequently. This is a common
approach in the case that <emphasis>all tasks are periodic</emphasis>.
So, a task that has to run every n milliseconds gets a higher priority
than a task that runs every m milliseconds when n&lt;m.  Hence,
changing the scheduling frequency of a task on-line also changes its
priority. The scheduler needs to know the periods of all tasks it has
to schedule.
  </para>
 </listitem>

 <listitem>
  <para>
<emphasis>Earliest deadline first<indexterm>
<primary>earliest deadline first</primary></indexterm><indexterm>
<primary>EDF</primary> <secondary>earliest deadline first</secondary>
</indexterm>
<indexterm>
 <primary>scheduling</primary>
 <secondary>earliest deadline first</secondary>
</indexterm></emphasis>
(<acronym>EDF</acronym>). At
each instant in time, there are a number of tasks that need to be
finished in the near future. A task with a closer deadline gets a
higher scheduling priority. The scheduler needs not only to know the
deadline time of all tasks it has to schedule, but also their duration.
  </para>
 </listitem>
</itemizedlist>
If different tasks in the system request different scheduling
policies, the operating system has to make trade-offs in determining
the relative &ldquo;weight&rdquo; to give to each of the scheduling
algorithms. These trade-offs will most probably be quite arbitrary, so
porting your application between operating systems could lead to
different scheduling results.
</para>

<para>
Priority-based scheduling<indexterm>
<primary>scheduling</primary><secondary>coupling</secondary>
</indexterm>
<indexterm>
 <primary>scheduling</primary><secondary>priority-based</secondary>
</indexterm>
<indexterm>
 <primary>priority-based scheduling</primary>
</indexterm>
is simple to <emphasis>implement</emphasis>,
because &schedf; just has to look at the tasks in the highest priority
queue that are ready to be scheduled, and to start the first one in
this queue. Priority-based scheduling, however, is
<emphasis>difficult for the application programmers</emphasis>: they
must try to map the often complex (&ldquo;high-dimensional&rdquo;)
synchronization interdependencies between the different threads in
their application
onto the <emphasis>linear scale</emphasis> offered by priorities! One
often-observed phenomenon in real-time applications that grow over
time, is that the programmers tend to raise the priorities of some
threads, every time they notice that the introduction of new
functionality (and hence new threads) disturbs the synchronization of
the existing threads. <xref linkend="chap-design"> gives some more
examples of the negative effects of &ldquo;coupling&rdquo;, and
<xref linkend="chap-patterns"> discusses time-proven
approaches to take care of complex interdependencies.
</para>

<para>
So, the problem with priority-based scheduling is that it is an
<emphasis>indirect</emphasis> way to specify how to cope with timing and
synchronization constraints: at run-time, &schedf; doesn't take these
constraints themselves into account, but knows only about the priorities,
which are the programmer's indirect model of the constraints.
</para>

<para>
In practice, all &rtos;s at least offer static priority-based
scheduling.  Many also implement other algorithms. Not always because
of the intrinsic added value of the algorithm, but rather because of
typical marketing drives: users tend to buy software products with the
highest number of features, even if they risk drowning in the
complexity and &ldquo;feature bloat&rdquo; (whose implications they
often don't even understand&hellip;).  One of the more serious feature
bloat examples in priority-based scheduling is the so-called
<emphasis>priority inheritance</emphasis> &ldquo;solution&rdquo; to
the <emphasis>priority inversion</emphasis> phenomenon
(see <xref linkend="sect-prior-inv">), that occurs when tasks share
resources which they should not access concurrently.
</para>

</sect1>

<sect1 id="prio-spaces">
<title>Priority spaces</title>

<para>
Many operating systems (especially &rtos;s) let all tasks (system
tasks as well as user tasks) live in the same priority space: any task
can be given any priority within this space. Others, such as
&linux;, &unix;, or &nt;, have separate priority spaces for different
kinds of tasks. &linux; has two: <emphasis>user space</emphasis> and
<emphasis>kernel space</emphasis>. Tasks running in user space can
change their priorities (through the <function>nice()</function>
function call), but all of them are pre-empted by any task in kernel
space. Kernel space itself has three priority levels:
<orderedlist>
 <listitem>
<para>
<emphasis>Interrupts<indexterm>
<primary>interrupt</primary></indexterm></emphasis>:
the &ldquo;task&rdquo; that services a hardware interrupt (timer,
network, keyboard, &hellip;) has the highest priority.  Such a task is
called an <emphasis>interrupt service routine (&isr;)</emphasis>.
(<xref linkend="sect-inter-sw">.)
It should be as short as possible, because it runs with all other
interrupts disabled. An &isr; is not really a task, but just a
<emphasis>function call</emphasis>, and its execution is not
determined by the scheduler: the &isr; is executed immediately at the
occurrence of a hardware interrupt, by the hardware of the interrupt
controller and the &cpu; (<xref linkend="sect-inter-hw">).
The operating system software is not involved at all.
</para>
 </listitem>

 <listitem>
<para>
<emphasis>Tasklet<indexterm>
<primary>tasklet</primary></indexterm></emphasis> functions
(&linux; specific, <xref linkend="sect-idsr">) and
<emphasis>Deferred Service Routines<indexterm>
<primary>Deferred Service Routine</primary></indexterm>
<indexterm>
 <primary>interrupt</primary>
 <secondary>Deferred Service Routine</secondary>
</indexterm></emphasis>
(terminology often used outside of &linux;) are
<emphasis>functions</emphasis>  (again,
<emphasis>not</emphasis> tasks!) that run at the second highest
priority. Only a hardware interrupt can pre-empt them. A tasklet can
be activated by any kernel task; a deferred
interrupt function (<xref linkend="sect-idsr">) is typically triggered
by a hardware interrupt service routine, to further process an
interrupt after the &isr; has finished. Both have the same properties,
and are executed after all hardware interrupt service routines have
finished, and before the &ldquo;normal&rdquo; tasks are scheduled;
interrupts are enabled when they run. In contrast to the hardware
interrupts, the operating system software <emphasis>is</emphasis>
involved in determining when they are executed.
</para>
 </listitem>

 <listitem>
<para>
<emphasis>All other kernel tasks</emphasis> run at the lowest
priority level in the kernel. They pre-empt every user space task.
</para>
 </listitem>
</orderedlist>
There is no consensus about the relative merits of having separate
user and kernel spaces: some consider it to be a design advantage
(<quote>divide et impera</quote>), while others experience it as an
unnecessarily artificial constraint on their flexibility.
</para>

</sect1>


<sect1 id="sched-linux">
<title>Linux scheduler</title>
<para>
The scheduler<indexterm>
<primary>&linux; scheduler</primary></indexterm>
<indexterm>
 <primary>scheduler</primary><secondary>&linux;</secondary>
</indexterm>
implemented in the file
<filename>/usr/src/linux/kernel/sched.c</filename> of the
&linux; source tree works with three scheduling modes (which are
defined in the &posix; standard):
<parameter>SCHED_RR</parameter>, <parameter>SCHED_FIFO</parameter> and
<parameter>SCHED_OTHER</parameter>. <parameter>SCHED_OTHER</parameter> is
the default. The scheduling mode of a task is set by the &posix;
<function>sched_setscheduler()</function> system call.
</para>
<para>
<parameter>SCHED_RR<indexterm>
<primary>SCHED_RR</primary></indexterm></parameter> is the
<emphasis>round-robin</emphasis> time slicing algorithm.
After a task finishes its time slice, it is moved to the tail of its
priority queue, such that another task in the same priority level can
start running.  If there is no other task at this priority, the
pre-empted task can continue.
</para>
<para>
<parameter>SCHED_FIFO<indexterm>
<primary>SCHED_FIFO</primary></indexterm></parameter> is a
<emphasis>First-In, First-Out</emphasis> scheduling algorithm: the
tasks in one priority level are scheduled in the order they get ready
to run; once a task is scheduled, it keeps the processor until
pre-empted by a higher priority task, until it releases the processor
voluntarily, or until it has to wait to get access to some resource.
This scheduler mode is often called &ldquo;&posix; soft
real-time&rdquo; because it corresponds to the most common real-time
scheduling approach with
<link linkend="stat-prior-sched">static priorities</link>, but without the
other
<link linkend="rtos-respon">necessary real-time components</link>.
</para>

<para>
The behaviour of the <parameter>SCHED_OTHER<indexterm>
<primary>SCHED_OTHER</primary></indexterm></parameter> scheduler
function is not prescribed by the &posix; standard. It is meant to
give freedom to the operating system programmers to implement their
own scheduling algorithm. In &linux;, as in all general-purpose
operating systems, the <parameter>SCHED_OTHER</parameter> scheduler
function tries to combine two conflicting performance measures:
maximum throughput and good response to interactive users. The
&linux; scheduler calculates a &ldquo;goodness&rdquo;
value for each candidate task, based on a number of
<emphasis>heuristic rules</emphasis>. Recently, the scheduler function
got a lot of attention from the &linux; kernel developers, since a new
<emphasis>O(1)</emphasis> (&ldquo;order one&rdquo;) scheduling
algorithm was introduced. <emphasis>O(1)</emphasis> means that the
function's computational time does not increase 
with the number of tasks that must be scheduled.
This has led to a more responsive kernel, certainly in
combination with the increased number of pre-emption points
(<xref linkend="linux-preempt">), which all lead to a call to the
scheduler function.
</para>

<para>
Kernel tasks with the <parameter>SCHED_OTHER</parameter> scheduling
policy receive the lowest priority, &ldquo;0&rdquo;, while the
<parameter>SCHED_RR</parameter> and <parameter>SCHED_FIFO</parameter>
policies can use priority levels from &ldquo;1&rdquo; to
&ldquo;99&rdquo;. User space tasks are always scheduled with the
<parameter>SCHED_OTHER</parameter> policy.  The priority levels 0 to
99 are prescribed in the &posix; standard, and the portable &posix; way
to find out about the minimum and maximum scheduling priorities is
through the <function>sched_get_priority_min()</function>
and 
<function>sched_get_priority_max()</function>
system calls. Both take one of the priority policies as their argument.
</para>

<para>
The scheduling for <emphasis>Symmetric Multi-Processor</emphasis>
(<acronym>SMP</acronym>) systems is basically the same as for
the uni-processor case. There are some extra function calls to assign
a task or an interrupt to a specific processor, if the programmer
so desires. This decision could lead to more efficient execution,
because it increases the chance that the task's or &isr;'s code can
permanently be kept in the cache of that particular processor.
</para>

</sect1>

<sect1 id="linux-preempt">
<title>Linux real-time scheduling</title>
<para><indexterm>
<primary>&linux;</primary><secondary>soft real time</secondary></indexterm>
<indexterm><primary>soft real time</primary></indexterm>
&linux; will not become a full-fledged &rtos;, for the simple reason
that the requirements for a general-purpose operating system are very
different from those of an &rtos;. However, soft real-time additions
to the standard &linux; kernel have been developed in several places.
</para>

<para>
One active source of soft real-time efforts
has been the audio and video community: in this area, &linux; and
&nt;<indexterm>
<primary>&nt;</primary></indexterm>
perform poorly, in comparison to, for example, &beos;<indexterm>
<primary>&beos;</primary></indexterm>
and &win95;.<indexterm>
<primary>&win95;</primary></indexterm>
The reason is that &linux; and &nt; can't guarantee these multi-media
tasks a deterministic share of the resources (&qos;). &beos; does
offer &qos; scheduling, while &win95; simply has far fewer things to
do than a &ldquo;real&rdquo; operating system&hellip;.
</para>

<para>
Another reason for soft real-time work is the drive to make &linux;
scale better on multi-processor systems. In this context, it is
important to keep the locks on kernel functionality as small as
possible, because if one processor needs a lock, the other processors
are also disturbed in their activity. The expectation is that the
scalability activity will make &linux; into an operating system that
can almost guarantee milli-second deadlines (i.e., &ldquo;soft real
time&rdquo;), without making it into a real &rtos;.
</para>

<para>
Here is a (non-exhaustive) list of efforts to improve on latency
problems in the &linux; kernel:
<itemizedlist>
 <listitem>
 <para>
<ulink url="http://www.mvista.com/products/hhl.html">Montavista's<indexterm>
<primary>Montavista</primary></indexterm>
Hard Hat Linux</ulink> with its
<emphasis>
<ulink
 url="http://www.mvista.com/dswp/PreemptibleLinux.pdf">pre-emption
</ulink></emphasis><indexterm>
<primary>pre-emption patches</primary></indexterm>
patches. These are currently maintained by 
<ulink url="http://www.tech9.net/rml/linux/">Robert Love</ulink>, and
gradually introduced in the new 2.5.x kernels. The idea is to see
whether the scheduler could run, at the moment that a kernel
spinlock (<xref linkend="sect-spinlock">) is released, or
an interrupt routine (<xref linkend="interrupts">) exits. 
Commands exist to disable or enable kernel pre-emption.
 </para>
</listitem>

<listitem>
 <para>
Ingo Molnar<indexterm><primary>Molnar, Ingo</primary></indexterm>'s
<emphasis>low-latency</emphasis><indexterm>
<primary>low-latency patches</primary></indexterm>
patches, now maintained by 
<ulink
 url="http://www.zipworld.com.au/~akpm/">Andrew Morton<indexterm>
 <primary>Morton, Andrew</primary></indexterm>
</ulink>.
They introduce
<ulink
 url="http://www.linuxdevices.com/articles/AT8906594941.html">more
scheduling points<indexterm>
<primary>scheduling point</primary></indexterm></ulink>
in the kernel code, such that the time is reduced between the
occurrence of an event that requires rescheduling and the actual
rescheduling. Probably, this work will be combined with the
pre-emption work mentioned above.
 </para>
 </listitem>

 <listitem>
 <para>
<application><ulink url="http://www.timesys.com">TimeSys Linux/RT</ulink>
</application> 
develops and commercializes the so-called
&ldquo;Resource Kernel&rdquo; loadable module, that makes the standard
&linux; kernel pre-emptable, and that allows one to build &qos; scheduling
for user tasks. 
 </para>
 </listitem>

 <listitem>
 <para>
<ulink url="http://www.ittc.ukans.edu/kurt/">&kurt;<indexterm>
<primary>&kurt;</primary></indexterm></ulink>
(<emphasis>Kansas University Real-Time Linux</emphasis>). 
&kurt; &linux; allows for explicit scheduling of any real-time
<emphasis>events</emphasis> rather than just
<emphasis>tasks</emphasis>. This provides a more generic framework
onto which normal real-time process scheduling is mapped. Since event
scheduling is handled by the system, addition of new events such as
periodic sampling data acquisition cards (video, lab equipment, etc.)
is highly simplified. 

&kurt; introduces two modes of operation: the normal mode and the
real-time mode. In normal mode, the system acts as a generic &linux;
system.  When the kernel is running in real-time mode, it only
executes real-time processes. While in real-time mode, the system can
no longer be used as a generic workstation, as all of its resources
are dedicated to executing its real-time responsibilities as
accurately as possible. 
 </para>
 </listitem>

 <listitem>
 <para>
 The <ulink url="http://www.linuxbios.org">LinuxBIOS</ulink> project
makes it possible to get rid of the usual <emphasis>BIOS<indexterm>
<primary>BIOS</primary></indexterm></emphasis> chips that manage part of the
hardware, and that introduce significant delays when booting. A
LinuxBIOS startup can take place in a few seconds, booting immediately
into a ready-to-go kernel.
</para>
 </listitem>

 <listitem>
 <para>
 <ulink url="http://www.uk.research.att.com/~dmi/linux-srt/">Linux-SRT</ulink>
is a <link linkend="sect-qos">&qos;</link> scheduler.
</para>
 </listitem>

 <listitem>
 <para>
<ulink url="http://www.cs.umass.edu/~lass/software/qlinux/">QLinux</ulink>
is a <link linkend="sect-qos">&qos;</link> scheduler.
 </para>
 </listitem>

 <listitem>
 <para>
<ulink url="http://fairsched.sourceforge.net/">Fairsched</ulink>
is a hierarchical &qos; scheduler:
tasks are divided into groups and each <emphasis>group</emphasis> receives
guaranteed &cpu; time allocation proportional to its weight.  The standard
scheduler is used to schedule processes within a group. 
 </para>
 </listitem>

 <listitem>
 <para>
 <ulink url="http://www.cc.gatech.edu/~west/dwcs.html">DWCS</ulink>
(<emphasis>Dynamic Window-Constrained Scheduling</emphasis>) is a &qos;
scheduler, parameterizing the service in terms of a <emphasis>request
period</emphasis> and a <emphasis>window constraint</emphasis>. The request
period is the time interval over which a task must receive some share of
the &cpu;; the window constraint is the value of that minimum share the
task must receive during this &ldquo;window.&rdquo;
 </para>
 </listitem>

</itemizedlist>
</para>

</sect1>

</chapter>


<chapter id="interrupts">
<title>Interrupts</title>
<para>
This Chapter explains the basics of interrupt servicing in a computer
system, with again an emphasis on the real-time application.
Interrupt hardware and software come in a great variety of
implementations and functionalities, so some of the concepts talked
about in this Chapter may not be relevant to your system.
</para>

<sect1 id="sect-int-basics">
<title>Introduction</title>
<para>
Interrupts are indispensable in most computer systems with real-time
ambitions. Interrupts have to be processed by a so-called &isr;
(<emphasis>Interrupt Service Routine</emphasis>). The faster this
&isr; can do its job, the better the real-time performance of the
&rtos;, because other tasks are delayed less.  Timers are one example
of peripheral devices that generate interrupts; other such devices are
the keyboard, DAQ (<emphasis>Data AcQuisition</emphasis>) cards,
video cards, the serial and parallel ports, etc. Also the processor
itself can generate interrupts, e.g., to switch to the
&ldquo;protected mode&rdquo; of the processor, when executing an
illegal operation, as part of a debugging session, or when an
&ldquo;exception&rdquo; is raised by an application program.
</para>

</sect1>


<sect1 id="sect-inter-hw">
<title>Interrupt hardware</title>
<para>
An interrupt-driven system (which many &rtos;s and &eos;s are)
typically has one or more of the following hardware components:
<itemizedlist>

 <listitem>
 <para>
<emphasis>Interrupt vector<indexterm>
<primary>interrupt vector</primary></indexterm>.</emphasis>
Many systems have more than one hardware interrupt line (also called
<emphasis>interrupt request (&irq;)</emphasis>),<indexterm>
<primary>interrupt request</primary></indexterm>
<indexterm><primary>&irq;</primary></indexterm>
and the hardware manufacturer typically assembles all these interrupt
lines in an &ldquo;interrupt vector&rdquo;.
The <acronym>INTEL</acronym> <acronym>80x86</acronym> processors'
interrupt vector contains 256 entries, and is called the
<emphasis>Interrupt Descriptor Table<indexterm>
<primary>Interrupt Descriptor Table</primary></indexterm>
 (IDT)<indexterm>
<primary>IDT</primary></indexterm></emphasis>,
<citation>Hyde97</citation>.
(But most PC manufacturers make only 16 of these interrupts available
as <emphasis>hardware</emphasis> interrupts! See below.)
The interrupt vector is an array of pointers to the interrupt
service routines (<xref linkend="sect-inter-sw">)
that will be triggered when the corresponding
interrupt occurs. The vector also contains a bit for each interrupt
line that signals whether there is an interrupt
<emphasis>pending</emphasis> on that line, i.e., a peripheral device
has raised the interrupt, and is waiting to be serviced.
 </para>

 <para>
Some processors use <emphasis>non-vectored</emphasis> interrupt
processing: when an interrupt occurs, control is transferred to one
single routine, that has to decide what to do with the interrupt. The
same strategy is also used, in software, in most operating systems to
allow multiple devices to share the same interrupt.
 </para>
 </listitem>

 <listitem>
 <para>
 <emphasis>Synchronous or software interrupt</emphasis>.
A synchronous interrupt<indexterm>
<primary>synchronous interrupt</primary></indexterm>
<indexterm>
 <primary>interrupt</primary><secondary>synchronous</secondary>
</indexterm>
(also called a software interrupt<indexterm>
<primary>software interrupt</primary></indexterm>
<indexterm>
 <primary>interrupt</primary><secondary>software</secondary>
</indexterm>, or a
trap<indexterm><primary>trap</primary></indexterm>)
is an interrupt that is not caused by an (asynchronous) hardware
event, but by a specific (synchronous)
<emphasis>machine language operation code</emphasis>, such as, for
example, the <function>trap</function> in the
<acronym>Motorola 68000</acronym>, the <function>swi</function> in the
<acronym>ARM</acronym>, or the <function>int</function> in the
<acronym>Intel 80x86</acronym>, or by a divide by zero, a memory
segmentation fault, etc.  Since this feature is supported in the
<emphasis>hardware</emphasis>, one can expect a large number of
different, not standardized, names and functions for software
interrupts&hellip;
</para>
<para>
The major differences between asynchronous/hardware interrupts and
synchronous/software interrupts, on most hardware, are that:
<orderedlist>

<listitem>
<para>
Further interrupts are <emphasis>disabled</emphasis> as soon as a
hardware interrupt comes in, but not disabled in the case of a
software interrupt.
</para>
</listitem>

<listitem>
<para>
The handler of a software interrupt runs in the context of the
interrupting task; the &isr; of a hardware interrupt has no
connected task context to run in. So, the OS provides a context (that
<emphasis>can</emphasis> be the context of the task that happened to
be running at the time of the interrupt).
</para>
</listitem>

</orderedlist>
Hardware and software interrupts do share the same interrupt vector,
but that vector then provides separate ranges for hardware and
software interrupts.
</para>

 </listitem>

 <listitem>
<para>
<emphasis>Edge-triggered<indexterm>
<primary>edge-triggered interrupt</primary></indexterm>
<indexterm>
 <primary>interrupt</primary><secondary>edge-triggered</secondary>
</indexterm>
and level-triggered<indexterm>
 <primary>level-triggered interrupt</primary></indexterm>
<indexterm>
 <primary>interrupt</primary><secondary>level-triggered</secondary>
</indexterm>
interrupts.</emphasis>
From a hardware point of view, peripheral devices can transmit their
interrupt signals in basically two different ways:
<itemizedlist>
<listitem>
<para>
<emphasis>Edge-triggered</emphasis>.
An interrupt is sent when the interrupt line changes from low to high,
or vice versa. That is an almost &ldquo;zero time&rdquo; event, which
increases the chances for a <emphasis>hardware</emphasis> loss of
interrupts by the interrupt controller. Moreover, if multiple devices
are connected to the same interrupt line, the operating system
<emphasis>must</emphasis> call all registered interrupt service
routines (see <xref linkend="sect-inter-sw">), because otherwise it
could cause a <emphasis>software</emphasis> loss of an interrupt: even
if it detected only one edge transition, and its first &isr;
acknowledged the receipt of this interrupt, it could still be that it
missed another edge transition, so it can only be sure after it has
given all &isr;s the chance to work. But of course, this is not an
efficient situation.
</para>
</listitem>
<listitem>
<para>
<emphasis>Level-triggered</emphasis>.
An interrupt is signaled by a change in the <emphasis>level</emphasis>
on the hardware interrupt line. This not only lowers the chance of
missing a transition, but it also allows a more efficient servicing
of the interrupts: each &isr; that has serviced the interrupt will
acknowledge its peripheral device, which will take away its
contribution to the interrupt line. So, the level will change again
after the last peripheral device has been serviced. And the operating
system need not try all &isr;s connected to the same hardware
interrupt line.
</para>
</listitem>
</itemizedlist>
 </para>
 </listitem>

 <listitem>
 <para>
<emphasis>Interrupt controller.<indexterm>
<primary>interrupt controller</primary></indexterm></emphasis>
This is a piece of hardware
that shields the operating system from the electronic details of the
interrupt lines.  Some controllers are able to
<emphasis>queue</emphasis> interrupts, such that none of them gets
lost (up to a given hardware limit, of course). Some allow various
ways of configuring <emphasis>priorities</emphasis> on the different
interrupts.
</para>

<para>
The
<ulink url="http://www.cast-inc.com/cores/c8259a/c8259a-x.pdf">8259</ulink>
<emphasis>Programmable Interrupt Controller<indexterm>
<primary>Programmable Interrupt Controller</primary></indexterm>
</emphasis> (<acronym>PIC<indexterm>
<primary>PIC</primary></indexterm></acronym>) is still a very common
chip for this job on PC architectures, despite its age of more than 25
years. PC builders usually use two PICs,
since each one can cope with only eight interrupts. But using more
than one <emphasis>has</emphasis> to happen in a
<emphasis>daisy chain<indexterm>
<primary>interrupt</primary><secondary>daisy chain</secondary>
</indexterm></emphasis>, i.e., the interrupt output pin of
the first PIC is connected to an input pin of the second one; this
introduces delays in the interrupt servicing. As another disadvantage,
PICs were not designed to be used in multiprocessor systems.
</para>

<para>
Higher-quality and/or SMP motherboards use the <emphasis>Advanced
Programmable Interrupt Controller<indexterm><primary>Advanced
Programmable Interrupt Controller</primary></indexterm> </emphasis>
(<acronym>APIC<indexterm><primary>APIC</primary></indexterm></acronym>).
<indexterm><primary>interrupt controller</primary>
<secondary>APIC</secondary></indexterm>
This is not just a single chip, but a small hardware system that
manages interrupts: 
<itemizedlist>
<listitem>
<para>
Each &cpu; must have a &ldquo;local APIC&rdquo;
with which it gets interrupts from the APIC system.
</para>
</listitem>
<listitem>
<para>
The peripheral hardware connects its interrupt line to the
<emphasis>I/O APIC</emphasis>. (There can be eight of them.) An I/O
APIC then sends a signal to the local APIC of the &cpu; for which the
interrupt is meant.
</para>
</listitem>
</itemizedlist>
The APIC architecture is better than the PIC, because (i) it can have
many more interrupt lines, hence eliminating the need to share
interrupts, (ii) it knows programmable interrupt priorities, (iii) it
is faster to program (only one machine instruction to the local APIC's
Task Priority Register (which is <emphasis>on</emphasis> the &cpu;!),
instead of two to the PIC, which in addition is not on the &cpu;) and
(iv) it allows to work with
<emphasis>level-triggered interrupts</emphasis> instead of with
<emphasis>edge-triggered interrupts</emphasis>.
The &pci; bus uses active low, level-triggered interrupts, so can work
fine together with APIC.
 </para>
 <para>
The PowerPC platforms have another interrupt hardware standard, the
<emphasis><ulink
url="http://www.itis.mn.it/inform/materiali/evarchi/cyrix.dir/opnparc.htm">OpenPIC<indexterm>
<primary>OpenPIC</primary></indexterm></ulink></emphasis>, which also guarantees a high hardware
<indexterm><primary>interrupt controller</primary><secondary>OpenPIC</secondary></indexterm>
quality. OpenPIC also works with x86 architectures.
 </para>
 </listitem>

</itemizedlist>
</para>

</sect1>


<sect1 id="sect-inter-sw">
<title>Interrupt software</title>
<para>
From the software side, an interrupt-driven system must typically
take into account one or more of the following software issues:
<itemizedlist>

 <listitem>
 <para>
<emphasis>Interrupt Service Routine<indexterm>
<primary>Interrupt Service Routine</primary></indexterm>
</emphasis> (&isr;<indexterm><primary>&isr;</primary></indexterm>),
often called <emphasis>interrupt handler<indexterm>
<primary>interrupt handler</primary></indexterm>
<indexterm>
 <primary>handler</primary><secondary>interrupt</secondary>
</indexterm></emphasis> tout court.
This software routine is called when an interrupt occurs on the
interrupt line for which the &isr; has been
<emphasis>registered</emphasis> in the interrupt vector. Typically,
this registration takes place through a system call to the operating
system, but it can also be done directly in a machine instruction, by
a sufficiently privileged program. The registration puts the address
of the function to be called by the interrupt, in the address field
provided in the interrupt vector at the index of the corresponding
interrupt number.
</para>

<para>
The operating system does not (or rather, cannot) intervene in the
launching of the &isr;, because everything is done by the &cpu;. The
context of the currently running task is saved on the stack of that
task: its address is in one of the &cpu; registers, and it is the only
stack that the &cpu; has immediate and automatic access to. This fact
has influence on the software configuration of the system: each task
must get enough stack space to cope with &isr; overhead. So, the
worst-case amount of extra stack space to be foreseen in a task's
memory budget can grow large, especially for systems in which
interrupts can be nested. More and more operating systems, however,
provide a separate &ldquo;context&rdquo;<indexterm>
<primary>context</primary><secondary>&isr;</secondary></indexterm>
 for interrupt servicing,
shared by <emphasis>all</emphasis> &isr;s;
examples are &linux; and &vxworks;.
</para>

<para>
An &isr; should be as short as possible, because it runs with
interrupts disabled, which prevents other interrupts from being
serviced, and, hence, other tasks from proceeding. The &isr; should
service the peripheral device it was triggered by, and then return.
This servicing typically consists of reading or writing some registers
on the device, and buffering them in a place where some other task can
process them further, outside of the &isr; and hence with interrupts
enabled again. This further processing is the goal of the &dsr;
(Deferred Service Routine), <xref linkend="sect-idsr">. Getting the
data from the &isr; to the &dsr; should be done in a
<emphasis>non-blocking</emphasis> way; &fifo;s
(<xref linkend="sect-fifo">) or circular buffers 
(<xref linkend="sect-circ-buf">) are often used for this purpose.
</para>
 </listitem>

 <listitem>
<para>
<emphasis>Trap handler/service request.</emphasis>
A synchronous interrupt is sometimes also called a
<emphasis>
<ulink
 url="http://www.osdata.com/topic/language/asm/trapgen.htm">trap</ulink>
<indexterm><primary>trap</primary></indexterm>
<indexterm>
 <primary>interrupt</primary><secondary>trap</secondary>
</indexterm></emphasis> or a 
<emphasis>software interrupt</emphasis>.<indexterm>
 <primary>software interrupt</primary></indexterm>
<indexterm>
 <primary>interrupt</primary><secondary>software</secondary>
</indexterm>
The software interrupts are &ldquo;called&rdquo; by the
processor itself, such as in the case of register overflow, page
address errors, etc. They work like hardware interrupts (saving
state, switching to protected mode, jumping to handler), but they run
with the hardware interrupts <emphasis>enabled</emphasis>.
 </para>
 <para>
These software interrupts are very important, because they are the
only means with which user space tasks can execute
&ldquo;protected operations<indexterm>
 <primary>protected operation</primary></indexterm>,&rdquo;
or &ldquo;privileged instructions<indexterm>
<primary>privileged instruction</primary></indexterm>.&rdquo;
These privileged instructions can only be executed when the processor
is in its <emphasis>protected mode<indexterm>
<primary>protected mode</primary></indexterm></emphasis> (also called
<emphasis>privileged mode<indexterm>
<primary>privileged mode</primary></indexterm></emphasis>).
Privileged instructions are operations such as: to address physical
IO directly; to work with the memory management infrastructure such as
the page lookup table; to disable and enable interrupts; or to halt
the machine. 
Privileged instructions are available to user space tasks via
<emphasis>system calls<indexterm>
<primary>system call</primary></indexterm></emphasis>, that are in
fact handlers of software interrupts: a system call puts some data
in registers or on the stack, and then executes a software interrupt,
which makes the processor switch to protected mode and run the
interrupt handler. That handler can use the register or stack data for
task specific execution. Recall that the handler of a software
interrupt runs in the context of the task that executes the system
call, so it can read the data that the task has put on the stack. But
now the execution takes place in the protected mode of the processor.
</para>
<para>
A system call is just one example of a software interrupt, or trap.
An interrupt service routine of a trap is
often called a <emphasis>trap handler<indexterm>
<primary>trap handler</primary></indexterm>
<indexterm>
 <primary>handler</primary><secondary>trap</secondary>
</indexterm></emphasis>.
Still another name for a software interrupt is a
<emphasis>service request<indexterm>
<primary>service request</primary></indexterm>
(SRQ)<indexterm>
 <primary>SRQ</primary><secondary>service request</secondary>
</indexterm></emphasis>.
Each type of &cpu; has part of its interrupt vector reserved for these
trap handlers.  Operating systems typically have a default trap handler
installed, which they attach to all possible software interrupts in
your system. Usually, you can replace any of these by your own. For
example, &rtai; has a <function>rt_set_rtai_trap_handler()</function>
for this purpose. The OS also reserves a number of traps as 
<emphasis>system signals<indexterm>
<primary>signal</primary></indexterm></emphasis>. For example, &rtai;
reserves 32 signals, most of which correspond to what standard &linux;
uses.
</para>
<para>
Trap handlers are a major tool in <emphasis>debugging</emphasis>:
compiling your code with the debug option turned on results, among
other things, in the introduction in (the compiled version of) your
original code of machine instructions that generate a trap after each
line in your code. The &isr; triggered by that trap can then inform
the debug task about which &ldquo;breakpoint&rdquo; in your program
was reached, thanks to the register information that the trap has
filled in (<xref linkend="sect-inter-sw">).
</para>
<para>
Another major application of the trap functionality, certainly in the
context of this document, is their use by &rtai; to deliver user space
hard real-time functionality (<xref linkend="rtai-lxrt">).
&linux; just uses one single software interrupt, at address
<parameter>0x80</parameter>, for its user space system calls, leaving
a lot of software interrupts to applications such as &rtai;.
</para>
 </listitem>

 <listitem>
 <para>
<emphasis>Interrupt latency.</emphasis><indexterm>
<primary>interrupt latency</primary></indexterm>
<indexterm>
 <primary>latency</primary><secondary>interrupt</secondary>
</indexterm>
This is the time between the arrival of the hardware interrupt and the
start of the execution of the corresponding &isr;.
The latency is not a crisp number, but rather a statistical quantity
because it is influenced by a large number of non-deterministic
effects (<xref linkend="latency">). 
This becomes more and more the case in modern processors with
their multiple levels of caches and instruction pipelines, that all
might need to be reset before the &isr; can start. This latter fact is
at the origin of the somewhat counter-intuitive phenomenon that some
modern Gigahertz &cpu;s have longer interrupt latencies than much
older digital signal processors.
 </para>
 </listitem>

 <listitem>
 <para>
<emphasis>Interrupt enable/disable.</emphasis> Each processor has atomic
operations to enable or disable (&ldquo;<emphasis>mask</emphasis>&rdquo;)
the interrupts. Common names for these functions are
<function>sti()</function> (&ldquo;set interrupt enable flag&rdquo;,
i.e., enable interrupts to come through to interrupt the &cpu;) and
<function>cli()</function> (&ldquo;clear interrupt enable flag&rdquo;,
i.e., don't allow interrupts).
In the same context, one finds functions like 
<function>save_flags()</function> and <function>restore_flags()</function>.
These are a bit more fine-grained than <function>sti()</function> and
<function>cli()</function>, in the sense that they save/restore a
bit-sequence where each bit corresponds to a hardware interrupt line
and indicates whether or not the interrupt on that particular line
should be enabled. In other words, it saves the &ldquo;state&rdquo; of
the interrupt vector.
(<function>restore_flags()</function> in some cases does an implicit
enabling of the interrupts too.)
Note that <function>cli()</function> disables <emphasis>all</emphasis>
interrupts on <emphasis>all</emphasis> processors in an &smp; system.
That is a costly approach to use, particularly so in an &rtos;. 
 </para>
 </listitem>

 <listitem>
 <para>
<emphasis>Interrupt priorities.</emphasis>
Some systems offer, as a <emphasis>hardware feature</emphasis>,
<emphasis>(static) priorities<indexterm>
<primary>static priority</primary></indexterm>
<indexterm><primary>interrupt</primary><secondary>priority</secondary>
</indexterm>
</emphasis>
to interrupts. That means that the OS blocks a new interrupt if an
&isr; of an interrupt with a higher priority is still running. (Or
rather, as long as it has not enabled the interrupts again.)
Similarly, the &isr; of a lower-priority interrupt is pre-empted when
a higher-priority interrupt comes in. Hence, &isr;s must be 
<emphasis>re-entrant.<indexterm>
<primary>re-entrant</primary></indexterm>
<indexterm>
 <primary>&isr;</primary><secondary>re-entrant</secondary>
</indexterm>
</emphasis>
And, if the processor allows interrupt priorities, most
opportunities/problems that are known in task scheduling
(see <xref linkend="sched-prob">) show up in the interrupt handling
too!
 </para>
 </listitem>

 <listitem>
 <para>
<emphasis>Prioritized interrupt disable.<indexterm>
<primary>prioritized interrupt disable</primary>
</indexterm></emphasis>
<indexterm>
 <primary>interrupt</primary><secondary>prioritized disable</secondary>
</indexterm>
Prioritized enabling/disabling of the interrupts is a
<emphasis>software feature</emphasis> (that must have hardware
support, of course) that allows the programmer to
disable interrupts below a specified <emphasis>priority level</emphasis>.
&nt; is an example of an OS kernel that extensively uses this feature.
 </para>
 </listitem>

 <listitem>
 <para>
<emphasis>Interrupt nesting.<indexterm>
<primary>interrupt nesting</primary></indexterm></emphasis>
If the processor and/or operating
system allow interrupt nesting, then an &isr; servicing one interrupt
can itself be pre-empted by another interrupt (which could come from
the same peripheral device that is now being serviced!). Interrupt
nesting increases code complexity, because &isr;s must use
<emphasis>re-entrant</emphasis> code only, i.e., the &isr; must be written in
such a way that it is robust against being pre-empted at any time.
 </para>
 </listitem>

 <listitem>
 <para>
<emphasis>Interrupt sharing.</emphasis><indexterm>
<primary>interrupt sharing</primary></indexterm>
<indexterm>
 <primary>interrupt</primary><secondary>sharing</secondary>
</indexterm>
Many systems allow different peripheral devices to be linked to the
same hardware interrupt. The &isr; servicing this interrupt must then
be able to find out which device generated the interrupt. It does this 
by (i) checking a status register on each of the devices that share the
interrupt, or (ii) calling in turn all &isr;s that users have
registered with this &irq;.
</para>

<para>
Interrupt sharing is implemented in most general purpose operating
systems, hence also in the &linux; kernel. 
(See the file <filename>kernel/softirq.c</filename>.)
&linux; accepts multiple interrupt handlers on the same interrupt
number.  The kernel hangs its own &isr; on the hardware interrupt, and
that kernel &isr; invokes one by one all the handler routines of the
&isr;s that have been registered by the application programs. This
means that they will be executed <emphasis>after</emphasis> the
hardware &isr; has finished, but <emphasis>before</emphasis> any other
tasks, and with interrupts enabled. 
</para>
<para>
While the &linux; kernel does interrupt sharing as mentioned in the
previous paragraph, &rtlinux; and &rtai; don't: they allow only one
single &isr; per &irq;, in order to be as deterministic as possible.
(So, be careful when putting interface cards in your computer, because
all the ones for which you want to install
<emphasis>real-time</emphasis> drivers must be connected to different
interrupt lines!)
The real-time &isr; that a user program has registered is directly
linked to the hardware interrupt, and hence runs with all interrupts
disabled.
The other &isr;s on that same &irq; are
only executed when the non-real-time &linux; kernel on top of the
&rtos; gets the occasion to run, i.e., after <emphasis>all</emphasis>
real-time activity is done, also non-&isr; activity.
</para>
 </listitem>

</itemizedlist>
</para>

</sect1>


<sect1 id="sect-idsr">
<title>&isr;, &dsr; and &asr;</title>
<para>
An &isr; should be as short as possible, in order to minimize the
delay of interrupts to other &isr;s, and the scheduling of tasks.
In general-purpose operating systems, only the &isr; that the OS has
attached to each &irq; runs with interrupts disabled, but not the
user-registered &isr;s. A real real-time operating system, on the
other hand, allows only one &isr; per &irq;, otherwise, the time
determinism of the other &isr;s is not guaranteed!  This makes the job
for real-time programmers a bit easier, because they can design
<emphasis>non-re-entrant</emphasis> (and hence often simpler and
faster) &isr;s: the &isr; can store local information without the
danger that it can be overwritten by another invocation of the
<emphasis>same</emphasis> &isr; code, and it has the guarantee of
<emphasis>atomicity</emphasis> (i.e., the &isr; will run without being
pre-empted). However, when the OS and the hardware allow interrupt
priorities, the &isr; at one &irq; level <emphasis>can</emphasis> be
pre-empted by a higher-priority interrupt.
</para>

<para>
Typically, a hardware &isr; just reads or writes the data involved in
the communication with the peripheral device or the trap that caused
the interrupt, acknowledges the interrupt if the peripheral device
requires it, and then, if needed, wakes up another &ldquo;task&rdquo;
to do any further processing. For example, most drivers for network
cards just transfer the raw packet data to or from the card in the
&isr;, and delegate all <emphasis>interpretation</emphasis> of the
data to another task. The skeleton of a typical &isr;-&dsr;
combination would look like this:
<programlisting>
<![CDATA[
dsr_thread()
{
   while (1) {
      wait_for_signal_from_isr();
      process_data_of_ISR (); // including all blocking stuff
   }
}

interrupt_handler( )
{
   reset_hardware();
   do_isr_stuff();
   send_signal_to_wake_up_dsr();
   re_enable_interrupts() // some RTOSs do this automatically
}
]]>
</programlisting>
</para>

<para>
In the &linux; kernel, this latter task used to be a
<emphasis>bottom half,</emphasis><indexterm>
<primary>bottom half</primary></indexterm>
while the hardware interrupt-driven &isr; was called the
<emphasis>top half</emphasis>.<indexterm>
<primary>top half</primary></indexterm>
(Note that some operating systems use opposite terminology.)
The bottom half concept is more or less abandoned, and replaced by
<emphasis>tasklets</emphasis><indexterm>
<primary>tasklet</primary></indexterm>
and
<emphasis>softirqs</emphasis><indexterm>
<primary>softirq</primary></indexterm> (see the files 
<filename class="headerfile">include/linux/interrupt.h</filename> and
<filename>kernel/softirq.c</filename>). The reason for abandoning the
bottom halves is that &linux; has a hard limit of at most 32
bottom-half functions. Moreover, they run with locks over the
<emphasis>whole</emphasis> system, which is not very good for
multi-processor systems. The softirq was introduced in the 2.3.43
kernel, as a multi-processor-aware version of the bottom half; there
are still only 32 of them, so application programmers should stay away
from them, and use <emphasis>tasklets<indexterm>
<primary>tasklet</primary></indexterm></emphasis> instead.
</para>

<para>
(Tasklets are a very appropriate primitive in the context of interrupt
servicing, but their usefulness is in no way limited to only this
context!)
</para>

<para>
&ldquo;Tasklet&rdquo; is a bit of an unfortunate name, because it has
not much to do with schedulable tasks: a tasklet is a
<emphasis>function</emphasis> that the kernel calls when an &isr; 
has requested a &ldquo;follow-up&rdquo; of its interrupt servicing.
Outside of the &linux; world, this follow-up function is more often
called &dsr;,<indexterm>
<primary>&dsr;</primary></indexterm>
<indexterm>
 <primary>interrupt</primary><secondary>&dsr;</secondary>
</indexterm>
<emphasis>Deferred Service Routine</emphasis>,<indexterm>
<primary>Deferred Service Routine</primary></indexterm>
<indexterm>
 <primary>interrupt</primary>
 <secondary>Deferred Service Routine</secondary>
</indexterm>
or (in &nt;),
<emphasis>Deferred Processing Call</emphasis>.
In &linux;, an unlimited number of tasklets is allowed, and they have
the same behaviour and functionality as the
<emphasis>softirqs</emphasis>.
</para>
<para>
In &linux; the &isr; requests the execution of a tasklet/&dsr; via the
<function>
<![CDATA[
tasklet_schedule(&tasklet)
]]>
</function> command.
The tasklet has first to be initialized with a
<function>
<![CDATA[
tasklet_init (&tasklet, tasklet_function, data)
]]>
</function>;
this call links a tasklet identifier with the function to be executed
and a data structure in which the &isr; can store information for
processing by the tasklet.  The tasklet (or &dsr;, or softirq) runs
with interrupts <emphasis>enabled</emphasis>, but outside of the
context<indexterm><primary>context</primary></indexterm>
of a particular task, just as the &isr; that has requested it.  This
means that neither the &isr;, nor the &dsr; can use variables that
have been defined locally in the scope of the task(s) to which they
logically are related.  The execution of tasklets is implemented in
the <filename>kernel/softirq.c</filename> file, and both tasklets and
softirqs are treated as <parameter>softirq</parameter> tasks.
</para>
<para>
&linux; (and many other general purpose operating systems) executes
the &dsr;s in sequence (without mutual pre-emption, that is), at the
end of hardware &isr;s and <emphasis>before</emphasis> the kernel
returns to user space. So, at the end of each kernel call, the
scheduler checks whether some &dsr;s are ready to be executed; see the
file <filename>kernel/softirq.c</filename> in the &linux; source code.
</para>

<para>
&rtai; also has tasklets, and their semantics is more or less like the
&linux; tasklets. However, &rtai; added some extra features
(see the files <filename>include/rtai_tasklets.h</filename> and
<filename>tasklets/tasklets.c</filename>):
<itemizedlist>
<listitem>
<para>
There is a special class of tasklets, called
<emphasis>timers<indexterm>
<primary>timers</primary></indexterm></emphasis>. They can be used to
let context-independent functions run with specified timings.
</para>
</listitem>

<listitem>
<para>
&rtai; also allows a user space function to be executed as a tasklet.
</para>
</listitem>

</itemizedlist>
The &rtai; tasklets are executed by a dedicated task and/or &isr; in
the &rtai; kernel.
</para>

<para>
An &isr; is not allowed to use semaphores or any other potentially
<emphasis>blocking</emphasis> system calls: an &isr; that blocks on
a lock held by another task causes big trouble, because all interrupts
are disabled when the &isr; runs, such that the condition to wake up
the other task might never occur. The same holds for &rtai; tasklets:
a blocking tasklet also blocks the timer &isr; or task that executes
all tasklets.
</para>

<para>
Avoiding blocking calls is sufficient for maximum determinism in a
&up; (&ldquo;uni-processor&rdquo;) system. In a multi-processor
system, however, a race condition (see <xref linkend="sect-race">) can
occur between the hardware &isr; on one processor, and any other task
on one of the other processors; e.g., because the &isr; and the other
task access shared data. (Remember that the &isr; cannot use a lock!)
The easiest solution is to not only mask the interrupts for one
processor, but for all of them. This, however, prevents
<emphasis>all</emphasis> processors from working. One way around this
is to use <emphasis>spinlocks</emphasis>
(see <xref linkend="sect-spinlock">). The operating system also helps
a bit, by guaranteeing that tasklets are
<emphasis>serialized<indexterm>
<primary>serialization</primary></indexterm>
over all processors</emphasis> in the system; i.e., only one is
executed at a time.
</para>

<para>
Some operating systems have one more level of interrupt sharing:
besides the &isr; and &dsr; functions, they offer the possibility to
use <emphasis>Asynchronous Service Routines<indexterm>
<primary>Asynchronous Service Routine</primary></indexterm>
 (&asr;<indexterm><primary>&asr;</primary></indexterm>)</emphasis>.
(This name is not as standardized as &isr; and &dsr;.) In &nt;, it is
called an <emphasis>Asynchronous Procedure Call</emphasis>; &ecos;
calls it &dsr;; &linux; doesn't have the concept.
&asr;s can run after all &dsr;s have finished, but before normal tasks
get a chance to be scheduled.  Their goal is to execute that part of
the reaction to an interrupt, that needs the thread's context; for
example, to make the thread stop some of its activities, including
itself.
</para>
<para>
The &ecos; operating system executes an &asr; with interrupts
enabled, with the scheduler disabled, and always in the context of one
specific thread. So, the &asr; can call all system functions, which is
not the case for &isr; and &dsr;, which are not bound to a
deterministically defined context.
</para>
<para>
The &rtai; operating system gives the possibility to add to each
real-time task a user-defined function that runs in the task's context
and with interrupts disabled, <emphasis>every</emphasis> time that the
task gets scheduled (hence, not just when an interrupt has occurred).
This allows, for example, an interrupt servicing to indirectly change
some task-specific attributes at each scheduling instant.  This
user-defined function is called
&ldquo;<function>signal()</function>&rdquo; and is filled in 
by <function>rt_task_init()</function> (XXX ???) in the task data
structure.  However, it's just a pointer to a function, so it could be
filled in or changed on-line.
</para>

</sect1>

</chapter>



<chapter id="ipc-synch">
<title>IPC: synchronization</title>

<para>
The decision about what code to run next is made by the operating system
(i.e., its scheduler), or by the hardware interrupts that force the processor
to jump to an associated interrupt routine. To the scheduler of the OS, all
tasks are just &ldquo;numbers&rdquo; in scheduling queues; and interrupts
&ldquo;talk&rdquo; to their own interrupt service routine only. So, scheduler
and interrupts would be sufficient organizational structure in a system where
all tasks just live next to each other, without need for cooperation. This, of
course, is not sufficient for many applications. For example, an interrupt
service routine collects measurements from a peripheral device, this data is
processed by a dedicated control task, the results are sent out via another
peripheral device to an actuator, and displayed for the user by still another
task. 
</para>
<para>
Hence, the need exists for <emphasis>synchronization</emphasis> of different
tasks (What is the correct sequence and timing to execute the different
tasks?), as well as for <emphasis>data exchange</emphasis> between them.
Synchronization and data exchange are complementary concepts, because the
usefulness of exchanged data often depends on the correct
synchronization of all tasks involved in the exchange.
Both concepts are collectively referred to as
<emphasis>Interprocess communication</emphasis> (&ldquo;&ipc;&rdquo;).
</para>
<para>
The role of the operating system in matters of &ipc; is to offer a
sufficiently rich set of &ipc;-supporting primitives. These should
allow the tasks to engage in &ipc; without having to bother with the
details of their implementation and with hardware dependence. This is
not a minor achievement of the operating system developers, because
making these &ipc; primitives safe and easy to use requires a lot of
care and insight. In any case, the current state-of-the-art in
operating systems' &ipc; support is such that they still don't offer
much more than just <emphasis>primitives</emphasis>. Hence, 
programmers have to know how to apply these primitives appropriately
when building software systems consisting of multiple concurrent
tasks; this often remains a difficult, because error-prone, design and
implementation job. Not in the least because no 
<emphasis>one-size-fits-all</emphasis> solution can exist for all
application needs.
</para>


<sect1 id="sect-ipc-terminology">
<title>IPC terminology</title>
<para>
The general <emphasis>synchronization</emphasis> and
<emphasis>data exchange</emphasis> problems involve (at least) two
tasks, which we will call the &ldquo;sender&rdquo; and the
&ldquo;receiver&rdquo;.  (These tasks are often also called
&ldquo;writer&rdquo; and &ldquo;reader&rdquo;, or
&ldquo;producer&rdquo; and &ldquo;consumer&rdquo;.)
For <emphasis>synchronization</emphasis>,
&ldquo;sender&rdquo; and &ldquo;receiver&rdquo; want to make sure they
are both in (or <emphasis>not</emphasis> in) specified parts of their
code at the same time.  
For <emphasis>data exchange</emphasis>, &ldquo;sender&rdquo; and
&ldquo;receiver&rdquo; want to make sure they can exchange data
efficiently, without having to know too much of each other
(&ldquo;decoupling&rdquo;, <xref linkend="chap-design">), and
according to several different <emphasis>policies</emphasis>,
such as blocking/non-blocking, or with/without data loss.
</para>

<para>
Data exchange has a natural direction of flow,
and, hence, the terminology &ldquo;sender&rdquo; and
&ldquo;receiver&rdquo; is appropriate. Synchronization is often
without natural order or direction of flow, and, hence, the
terminology &ldquo;sender&rdquo; and &ldquo;receiver&rdquo; is less
appropriate in this context, and &ldquo;(IPC) client&rdquo; might be
a more appropriate, because symmetric, terminology. Anyway, the exact
terminology doesn't matter too much. Unless we want to be more
specific, we will use the generic system calls
<function>send()</function> and <function>receive()</function> to
indicate the &ipc; primitives used by sender and receiver,
respectively. 
</para>

<sect2 id="blocking">
<title>Blocking/Non-blocking </title>
<para>
&ipc; primitives can have different effects on <emphasis>task
scheduling</emphasis>:
<itemizedlist>
 <listitem>
 <para>
<emphasis>Blocking.</emphasis> When executing the
<function>send()</function> part of the &ipc;, the sender task is blocked
(i.e., non-available for scheduling) until the receiver has
accepted the &ipc; in a <function>receive()</function> call. And similarly
the other way around. If both the sender and the receiver block until
<emphasis>both of them</emphasis> are in their
<function>send()</function>
and <function>receive()</function> commands, the &ipc; is called
<emphasis>synchronous</emphasis>. (Other names are: 
<emphasis>rendez-vous</emphasis>, or <emphasis>handshake</emphasis>.)
Synchronous &ipc; is the easiest to design with, and is very similar to
building hardware systems.
 </para>
 </listitem>

 <listitem>
 <para>
<emphasis>Non-blocking (asynchronous).</emphasis> Sender and receiver are not
blocked in their &ipc; commands. This means that there is incomplete
synchronization: the sender doesn't know when the receiver will get its
message, and the receiver cannot be sure the sender is still in the same state
as when it sent the message.
 </para>
 </listitem>

 <listitem>
 <para>
<emphasis>Blocking with time out.</emphasis> The tasks wait in their 
&ipc; commands for at most a specified maximum amount of time.
 </para>
 </listitem>

 <listitem>
 <para>
<emphasis>Conditional blocking.</emphasis> The tasks block in their 
&ipc; commands only if a certain condition is fulfilled.
 </para>
 </listitem>

</itemizedlist>
Of course, blocking primitives should be used with care in real-time sections
of a software system.
</para>

</sect2>

<sect2 id="ipc-coupling">
<title>Coupling</title>

<para>
&ipc; primitives can use different degrees of <emphasis>coupling</emphasis>:
<itemizedlist>

 <listitem>
 <para>
 <emphasis>Named connection</emphasis>: sender and receiver know about
each other, and can <emphasis>call each other by name</emphasis>. That
means that the sender fills in the unique identifier of the receiver in
its <function>send()</function> command, and vice versa. This can set up a
connection between both tasks, in a way very similar to the telephone
system, where one has to dial the number of the person one wants to
talk to.
 </para>

 <para>
The connection can be <emphasis>one-to-one</emphasis>, or
<emphasis>one-to-many</emphasis> (i.e., the single sender sends to more
than one receiver, such as for broadcasting to a set of named correspondents),
or <emphasis>many-to-one</emphasis> (for example, many tasks send logging
commands to an activity logging task), or <emphasis>many-to-many</emphasis>
(for example, video conferencing).
 </para>
 </listitem>

 <listitem>
 <para>
 <emphasis>Broadcast</emphasis>: the sender sends its message to all
&ldquo;listeners&rdquo; (without explicitly calling them by name) on
the (sub-branch of the) <emphasis>network</emphasis> to which it is
connected. The listeners receive the message if they want, without the
sender knowing exactly which tasks have really used its message.
 </para>
 </listitem>

 <listitem>
 <para>
 <emphasis>Blackboard</emphasis>: while a broadcast is a message on a
network-like medium (i.e., the message is not stored in the network
for later use), a blackboard &ipc; <emphasis>stores</emphasis> the
messages from different senders. So, receivers can look at them at any
later time.
 </para>
 </listitem>

 <listitem>
 <para>
 <emphasis>Object request broker</emphasis> (&orb;): the previous
types of &ipc; all imply a rather high level of
<emphasis>coupling</emphasis> between sender and receiver, in the
sense that they have to know explicitly the identity of their
communication partner, of the network branch, or of the blackboard.
The current trend towards more
<emphasis>distributed</emphasis> and <emphasis>dynamically
reconfigurable</emphasis> computer systems calls for more
<emphasis>loosely-coupled</emphasis> forms of &ipc;. The &orb; concept
has been developed to cover these needs: a sender
<emphasis>component</emphasis> registers its
<emphasis>interface</emphasis> with the &orb;; interested
receivers can ask the broker to forward their requests to an
appropriate sender (&ldquo;server&rdquo;)  component, without the need
to know its identity, nor its address.
 </para>
 </listitem>

</itemizedlist>
</para>

</sect2>

<sect2 id="ipc-buffering">
<title>Buffering</title>

<para>
&ipc; primitives can use different degrees of
<emphasis>buffering</emphasis>, ranging from the case where the
operating system stores and delivers all messages, to the case where
the message is lost if the receiver is not ready to receive it.
</para>

<para>
Not all of the above-mentioned forms of &ipc; are equally appropriate
for <emphasis>real-time</emphasis> use, because some imply too
much and/or too indeterministic overhead for communication and resource
allocation.
</para>

</sect2>

</sect1>


<sect1 id="sect-race">
<title>Race conditions and critical sections</title>
<para>
<indexterm><primary>race condition</primary></indexterm>
<indexterm><primary>critical section</primary></indexterm>
Often, two or more tasks need access to the same data or device, for
writing and/or reading.  The origin of most problems with
<emphasis>resource sharing<indexterm>
<primary>resource sharing</primary></indexterm></emphasis>
(or <emphasis>resource allocation<indexterm>
<primary>resource allocation</primary></indexterm></emphasis>)
in multi-tasking and multi-processor systems is
the fact that operations on resources can usually not be performed
<emphasis>atomically</emphasis>, i.e., as if they were executed as one
single, non-interruptable instruction that takes zero time. Indeed, a
task that interfaces with a resource can at any instant be pre-empted,
and hence, when it gets re-scheduled again, it cannot just take for
granted that the data it uses now is in the same state (or at least, a
state that is consistent with the state) before the pre-emption.
Consider the following situation:
<programlisting>
<![CDATA[
data number_1;
data number_2;

task A
{   data A_number;

    A_number = read(number_1);
    A_number = A_number + 1;
    write(number_2,A_number);
}

task B
{  if ( read(number_1) == read(number_2) )
       do_something();
    else
       do_something_else();
}
]]>
</programlisting>
<function>task B</function> takes different actions based on the
(non-)equality of <parameter>number_1</parameter> and
<parameter>number_2</parameter>. But <function>task B</function> can
be pre-empted in its <function>if</function> statement by
<function>task A</function>, exactly at the moment that <function>task
B</function> has already read <parameter>number_1</parameter>, but not
yet <parameter>number_2</parameter>.  This means that it has read
<parameter>number_1</parameter> before the pre-emption, and
<parameter>number_2</parameter> after the pre-emption, which violates
the validity of the test.
</para>

<para>
The <function>if</function> statement is one example of a so-called
<emphasis>critical section<indexterm>
<primary>critical section</primary></indexterm></emphasis>:
it is critical to the validity of the code that the
<emphasis>access to the data</emphasis> used in that statement
(i.e., <parameter>number_1</parameter> and
<parameter>number_2</parameter>) be executed
<emphasis>atomically</emphasis>, i.e., un-interruptable by anything
else. (Most) <emphasis>machine code</emphasis> instructions of a given
processor execute atomically; but instructions in higher-level
programming languages are usually translated into a sequence of many
machine code instructions, such that atomicity cannot be guaranteed.
</para>

<para>
There are three generic types of critical sections:
<itemizedlist>

 <listitem>
 <para><emphasis>Access to the same data from different tasks,</emphasis>
as illustrated by the example above.
 </para>
 </listitem>

 <listitem>
 <para><emphasis>Access to a service.</emphasis>
For example, allocation of a resource, execution of a
&ldquo;transaction&rdquo; on a database. The service typically has to
process a sequence of queries, and these have to succeed as a whole,
or fail as a whole.
</para>
 </listitem>

 <listitem>
 <para><emphasis>Access to procedure code.</emphasis>
Application tasks often run exactly the same code (for example the control
algorithm in each of the joints of a robot), but on other data, and
some parts of that code should be executed by one task at a time only.
 </para>
 </listitem>
</itemizedlist>
Of course, many applications involve combinations of different resource
sharing needs.
</para>

<para>
The problem in all above-mentioned examples of access to shared resources is
often called a <emphasis>race condition<indexterm>
<primary>race condition</primary></indexterm></emphasis>:
two or more tasks compete (&ldquo;race&rdquo;) against each other to
get access to the shared resources. Some of these race conditions
have been given a special name:
<itemizedlist>

 <listitem>
 <para>
<emphasis>Deadlock.<indexterm>
<primary>deadlock</primary></indexterm></emphasis>
<function>Task A</function> has locked a resource and is blocked
waiting for a resource that is locked by <function>task B</function>,
while <function>task B</function> is blocked waiting for the resource
that is locked by <function>task A</function>.
 </para>
 </listitem>

 <listitem>
 <para>
<emphasis>Livelock.<indexterm>
<primary>livelock</primary></indexterm></emphasis>
This situation is similar to the deadlock, with this difference: both
tasks are not blocked, but keep actively trying to get the resource, in
some form of <emphasis>busy waiting</emphasis>.
 </para>
 </listitem>

 <listitem>
 <para>
<emphasis>Starvation.<indexterm>
<primary>starvation</primary></indexterm></emphasis>
In this situation, some tasks never get the chance to allocate the resource
they require, because other tasks always get priority.
 </para>
 </listitem>
</itemizedlist>
The four conditions that have to be satisfied in order to
(potentially!) give rise to a deadlock are:
<orderedlist>
 <listitem>
 <para>
Locks are only <emphasis>released voluntarily</emphasis> by tasks. So,
a task that needs two locks might obtain the first lock, but block on
the second one, so that it is not able anymore to voluntarily release
the first lock.
 </para>
 </listitem>

 <listitem>
 <para>
Tasks can only get in a deadlock if they need
<emphasis>more than one lock</emphasis>, and have to obtain them in a
(non-atomic) <emphasis>sequential</emphasis> order.
 </para>
 </listitem>

 <listitem>
 <para>
The resources guarded by locks can only be
<emphasis>allocated to one single task</emphasis>.
(Or to a finite number of tasks.)
 </para>
 </listitem>

 <listitem>
 <para>
Tasks try to obtain locks that other tasks have already obtained, and
these tasks form a <emphasis>circular list</emphasis>. For example,
<function>task A</function> is waiting for <function>task B</function>
to release a lock, <function>task B</function> is waiting for
<function>task C</function> to release a lock, and
<function>task C</function> is waiting for <function>task A</function>.
 </para>
 </listitem>

</orderedlist>
As soon as <emphasis>one</emphasis> of these four conditions is not
satisfied, a deadlock can not occur. Moreover, these conditions are
<emphasis>not sufficient</emphasis> for deadlocks to occur: they just
describe the conditions under which it is
<emphasis>possible</emphasis> to have deadlocks.
</para>

<para>
The literature contains many examples of deadlock
<emphasis>avoidance<indexterm>
<primary>deadlock avoidance</primary></indexterm>
<indexterm>
 <primary>deadlock</primary><secondary>avoidance</secondary>
</indexterm></emphasis> and <emphasis>prevention<indexterm>
<primary>deadlock prevention</primary></indexterm>
<indexterm>
 <primary>deadlock</primary><secondary>prevention</secondary>
</indexterm></emphasis>
algorithms. Deadlock prevention makes sure that all four necessary
conditions are never satisfied at the same time; deadlock avoidance
allows the possibility for a deadlock to occur, but makes sure that
this possibility is never realized. Both kinds of algorithms, however,
often require some form of &ldquo;global&rdquo; knowledge about the
states of all tasks in the system. Hence, they are too indeterministic
for real-time execution, and not suitable for
<emphasis>component-based</emphasis> design (because the requirement
for global knowledge is in contradiction with the
<emphasis>loose coupling</emphasis> strived for in component systems
(see <xref linkend="chap-design">)).
</para>
<para>
There are some guaranteed deadlock prevention algorithms, that are
reasonably simple to implement. For example, a deadlock cannot occur
if <emphasis>all programs</emphasis> always take locks in the same
order. This requires a globally known and ordered list of locks, and
coding discipline from the programmers. Other prevention algorithms
use some of the following approaches: only allow each task to hold one
resource; pre-allocate resources; force release of a resource before a
new request can be made; order all tasks and give them priority
according to that order.
</para>

<para>
Race conditions occur on a single processor system because of its
multi-tasking and interrupt functionalities. But they show up even
more on <emphasis>multi-processor systems</emphasis>: even if one &cpu;
is preventing the tasks that it runs from accessing a resource
concurrently, a task on another &cpu; might interfere.
</para>

</sect1>


<sect1 id="sect-signal">
<title>Signals</title>
<para>
<emphasis>Signals<indexterm>
<primary>signal</primary></indexterm></emphasis> 
are one of the &ipc; synchronization primitives used for
<emphasis>asynchronous notification<indexterm>
<primary>asynchronous notification</primary></indexterm><indexterm>
<primary>notification</primary></indexterm></emphasis>: one task fires
a signal, which <emphasis>can</emphasis> cause other tasks to start
doing things. The emphasis is on &ldquo;asynchronous&rdquo; and on
&ldquo;can&rdquo;:
<itemizedlist>

<listitem>
<para>
<emphasis>Asynchronous<indexterm>
<primary>asynchronous</primary></indexterm></emphasis>: the tasks that
react to signals are in a completely arbitrary state, unrelated to
the signaling task. Their reaction to the signal also need not be
instantaneous, or synchronized, with the signaling task. The task that
sends the signal, and the tasks that use the signal, need not share
any memory, as in the case of semaphores or mutexes. This makes
signals about the only synchronization primitive that is
straightforward to scale over a <emphasis>network</emphasis>.
</para>
</listitem>

<listitem>
<para>
<emphasis>Can</emphasis>: the signaling task fires a signal, and
continues with its job. Whether or not other tasks do something with
its signal is not its concern. The operating system takes care of
the delivery of the signal, and if nobody wants it, it is just lost.
</para>
</listitem>

</itemizedlist>
In most operating systems, signals
 <itemizedlist>
  <listitem>
  <para>
    are <emphasis>not queued</emphasis>. A task's signal handler has
no means to detect whether it has been signaled more than once.
  </para>
  </listitem>
  <listitem>
  <para>
    <emphasis>carry no data</emphasis>.
  </para>
  </listitem>
  <listitem>
  <para>
    <emphasis>have no deterministic delivery time</emphasis>.
A task that gets signaled is not necessarily scheduled immediately.
  </para>
  </listitem>
  <listitem>
  <para>
    <emphasis>have no deterministic order</emphasis>.
A task that gets signaled multiple times has no way to find out in which
temporal order the signals were sent.
  </para>
  </listitem>
 </itemizedlist>
So, these are reasons to avoid signals for
<emphasis>synchronization between two running tasks</emphasis>, 
<citation>BrinchHansen73</citation>. In other words:
<emphasis>notification</emphasis> in itself is not sufficient for
<emphasis>synchronization<indexterm>
<primary>synchronization</primary></indexterm></emphasis>.
Synchronization needs two tasks that do something together, while
taking notice of each other, and respecting each other's activities.
Later sections of the text present &ipc; primitives that are better
suited for synchronization than signals.
</para>
<para>
&posix; has standardized signals and their connection to threads. The
OS offers a number of pre-defined signals (such as
&ldquo;kill&rdquo;), and a task can ask the operating system to connect
a handler (i.e., a function) to a particular signal on its behalf. The
handler is &ldquo;registered&rdquo;, using the system call
<function>sigaction()</function>. The task also asks the OS to receive
or block a specific subset of all available signals; this is its
&ldquo;signal mask&rdquo;. Whenever a signal is 
received by the operating system, it executes the registered handlers
of all tasks that have this signal in their mask. The task can also
issue a <function>sigwait(signal)</function>, which makes it sleep
until the <function>signal</function> is received; in this case, the
signal handler is <emphasis>not executed</emphasis>.
Anyway, signals are a bit difficult to work with, as illustrated by
this quote from the <function>signal</function> man page:
<blockquote>
<para>
For <function>sigwait</function> to work reliably, the signals being
waited for must be blocked in all threads, not only in the calling
thread, since otherwise the &posix; semantics for signal delivery do
not guarantee that it's the thread doing the
<function>sigwait</function> that will receive the signal. The best
way to achieve this is to block those signals before any threads are
created, and never unblock them in the program other than by calling
<function>sigwait</function>.
</para>
</blockquote>
The masks are also set on a <emphasis>per-thread</emphasis> basis, but
the signal handlers are shared between all threads in a process.
Moreover, the implementation of signals tends to differ between 
operating systems, and the &posix; standard leaves room for
interpretation of its specification. For example, it doesn't say
anything about the <emphasis>order</emphasis> in which blocked threads
must be woken up by signals. So, these are reasons why many developers
don't use signals too much. 
</para>

<para>
&posix; has a specification for so-called &ldquo;real-time
signals&rdquo; too. Real-time signals are queued, they pass a 4-byte
data value to their associated signal handler, and they are guaranteed
to be delivered in numerical order, i.e., from lowest signal number to
highest. 
For example, &rtlinux; implements &posix; real-time signals,
and offers 32 different signal levels. (See the file
<filename>include/rtl_sched.h</filename> in the &rtlinux; source tree.)
And &rtai; also offers a 32 bit unsigned integer for events, but in a
little different way: the integer is used to allow signalling
<emphasis>multiple</emphasis> events: each bit in the integer is an
event, and a task can ask to be notified when a certain AND or OR
combination of these bits becomes valid. (See the file
<filename>include/rtai_bits.h</filename> in the &rtai; source tree.)
</para>

</sect1>

<sect1 id="exceptions">
<title>Exceptions</title>
<para>
<emphasis>Exceptions<indexterm><primary>exception</primary></indexterm>
</emphasis>
are signals that are sent
(&ldquo;raised&rdquo;) <emphasis>synchronously</emphasis>, i.e., by
the task that is currently running. (Recall that signals are
<emphasis>asynchronous</emphasis>, in the sense that a task can
receive a signal at any arbitrary moment in its lifetime.)
Exceptions are, roughly speaking, a signal from a task to itself.
As an operating system primitive, an exception is a software interrupt
(see <xref linkend="sect-int-basics">) used to handle non-normal cases
in the execution of a task: numerical errors; devices that are not
reachable or deliver illegal messages; etc.  The software interrupt
gives rise to the execution of an exception handler, that the task (or
the operating system, or another task) registered previously.
In high-level programming languages, an exception need not be a
software interrupt, but it is a function call to the language's
<emphasis>runtime</emphasis> support, that will take care of the
exception handling.
</para>

</sect1>


<sect1 id="sect-atomic">
<title>Atomic operations</title>
<para>
The concept of an <emphasis>atomic operation</emphasis> is very
important in interprocess communication, because the operating system
must guarantee that taking or releasing a lock is done without
interruption. That can only be the case if the
<emphasis>hardware</emphasis> offers some form of atomic operation on
bits or bytes.  Atomic operations come in various forms: in the
hardware, in the operating system, in a language's run-time, or in an
application's support library, but always, the hardware atomic
operation is at the bottom of the atomic service. This Section focuses
on the <emphasis>hardware</emphasis> support that is commonly
available.
</para>

<para>
Most processors offer an atomic machine instruction to
<emphasis>test a bit</emphasis> (or a byte or a word). In fact, the
operation not just <emphasis>tests</emphasis> the bit, but also
<emphasis>sets</emphasis> the bit if that bit has not already
been set.  Hence, the associated assembly instruction is often called
<function>test_and_set()</function>, or something similar. 
Expressed in pseudo-code, the <function>test_and_set()</function>
would look like this:
<programlisting>
int test_and_set(int *lock){
   int temp = *lock;
   *lock = 1;
   return temp; 
}
</programlisting>
</para>

<para>
Another atomic instruction offered by (fewer) processors
is <function>compare_and_swap(address,old,new)</function>:<indexterm>
<primary><function>compare_and_swap</function></primary></indexterm>
it compares a value at a given memory address with an
&ldquo;old&rdquo; value given as parameter, and overwrites it with a
&ldquo;new&rdquo; value if the compared values are the same; in this
case, it returns &ldquo;true&rdquo;. If the values are not equal, the
memory location is left unchanged and it returns &ldquo;false&rdquo;.
Examples of processors with a
<function>compare_and_swap()</function>
are the <acronym>Alpha</acronym>, <acronym>ia32/ia64</acronym>,
<acronym>SPARC</acronym> and the <acronym>M68000/PowerPC</acronym>.
(Look in the &linux; source tree
for the <parameter>__HAVE_ARCH_CMPXCHG</parameter> macro to find
them.)
</para>

<para>
The <function>compare_and_swap()</function> operation is appropriate
for the implementation of the synchronization needed in, for example,
<emphasis>swinging pointers</emphasis>
(see <xref linkend="sect-lock-free">): in this case, the parameters of the
<function>compare_and_swap(address,old,new)</function> are the address of the
pointer and its old and new values.
</para>

<para>
The following pseudo-implementation is the simplest way to understand the
semantics of the <function>compare_and_swap()</function>:
<programlisting>
int compare_and_swap(address, old, new) {
  get_lock();
  if (*address == old) {
    *address = new;
    release_lock();
    return (1);
  } else {
    release_lock();
    return (0);
  }
}
</programlisting>
</para>

<para>
The <function>compare_and_swap()</function> can, however, be
implemented without locks, using the following pair 
of atomic instructions: <function>load_linked()</function> and
<function>store_conditional()</function>.
Together, they implement an atomic read-modify-write cycle.
The idea is that the
<function>load_linked()</function> instruction marks a memory location
as &ldquo;reserved&rdquo; (but does not lock it!) and if no processor
has tried to change the contents of that memory location when the
<function>store_conditional()</function> takes place, the store will
succeed, otherwise it will fail. If it fails, the calling task must
decide what to do next: retry, or do something else.
</para>

<para>
This pair of instructions can be used to implement
<function>compare_and_swap()</function> in an obvious way, and without
needing a lock:
<programlisting>
int compare_and_swap(address, old, new) {
  temp = load_linked(address);
  if (old == temp) return store_conditional(address,new);
  else return 0;
}
</programlisting>
The test <function>old == temp</function> need not take place in a
critical section, because both arguments are
<emphasis>local</emphasis> to this single task.
</para>

<para>
There are some <emphasis>important caveats</emphasis> with the
<function>compare_and_swap()</function> function: 
<itemizedlist>

 <listitem>
 <para>
It only compares the <emphasis>values</emphasis> at a given memory
location, but does not detect whether (or how many times) this value
has changed! That is: a memory location can be changed twice and have
its original value back.  To overcome this problem, a more extensive
atomic operation is needed, the
<function>double_word_compare_and_swap()</function>, which also checks
a <emphasis>tag</emphasis> attached to the pointer, and that
increments the tag at each change of the value of the pointer. This
operation is not very common in processors!
 </para>
 </listitem>

 <listitem>
 <para>
It is <emphasis>not multi-processor safe</emphasis>:
(TODO: why exactly?)
 </para>
 </listitem>

</itemizedlist>
</para>

<para>
While the hardware support for <emphasis>locks</emphasis> is
quite satisfactory, there is no support for
<emphasis>transaction rollback<indexterm>
<primary>transaction rollback</primary></indexterm>
</emphasis>.
Transaction rollback means that the software can
undo the effects of a sequence of actions, in such a way that the
complete sequence takes place as a whole, or else is undone without
leaving any trace. Transaction rollback is a quite advanced feature,
and not supported by operating systems; it's however a primary
component of high-end database servers.
</para>

</sect1>


<sect1 id="sect-locks">
<title>Semaphore, mutex, spinlock, read/write lock, barrier</title>
<para>
Race conditions can occur because the access to a shared resource is
not well synchronized between different tasks. One solution is to
allow tasks to get a
<emphasis>lock<indexterm>
<primary>lock</primary></indexterm></emphasis> on the resource.
The simplest way to lock is to disable all interrupts and disable the
scheduler when the task wants the resource. This is certainly quite
effective for the running task, but also quite drastic and far from
efficient for the activity of all other tasks. Hence, programmers
should not use these methods lightly if they want to maintain real
multi-tasking in the system.  So, this text focuses on locking
mechanisms that do <emphasis>not</emphasis> follow this drastic
approach. Basically, programmers can choose between two types of
locking primitives (see later sections for more details):
<orderedlist>
<listitem>
<para>
One based on <emphasis>busy waiting</emphasis>. This method has
overhead due to wasting &cpu; cycles in the busy waiting,
but it avoids the overhead due to bookkeeping of queues in which tasks
have to wait.
</para>
</listitem>
<listitem>
<para>
One based on the concept of a <emphasis>semaphore</emphasis>. This
method has no overhead of wasting &cpu; cycles, but it does have the
overhead of task queue bookkeeping and context switches.
</para>
</listitem>
</orderedlist>
A generic program that uses locks would look like this:
<programlisting>
data number_1;
data number_2;
lock lock_AB;

task A
{        data A_number;

         get_lock(lock_AB);
         A_number = read(number_1);
         A_number = A_number + 1;
         write(number_2,A_number);
         release_lock(lock_AB);
}

task B
{        get_lock(lock_AB);
         i = ( read(number_1) == read(number_2) );
         release_lock(lock_AB);
         if ( i )
                 do_something();
         else    do_something_else();
}
</programlisting>
The <function>get_lock()</function>
and <function>release_lock()</function> function calls do not belong
to any specific programming language, library or standard. They have
just been invented for the purpose of illustration of the idea.
When either <function>task A</function> or
<function>task B</function> reaches its so-called
<emphasis>critical section</emphasis>, it requests the lock; it gets
the lock if the lock is not taken by the other task, and can enter the
critical section; otherwise, it waits (&ldquo;blocks&rdquo;,
&ldquo;sleeps&rdquo;) till the other task releases the lock at the end
of its critical section. A blocked task cannot be scheduled for
execution, so locks are to be used with care in real-time
applications: the application programmer should be sure about the
<emphasis>maximum</emphasis> amount of time that a task can be delayed
because of locks held by other tasks; and this maximum should be less
than that specified by the timing constraints of the system.
</para>

<para>
The <function>get_lock()</function> should be executed
<emphasis>atomically</emphasis>, in order to avoid a race condition
when both tasks try to get the lock at the same time. (Indeed, the lock
is in this case an example of a shared resource, so locking is prone
to all race conditions involved in allocation of shared resources.)
The atomicity of getting a lock seems to be a vicious circle: one
needs a lock to guarantee atomicity of the execution of the function
that must give you a lock.
Of course, (only) the use of an atomic machine instruction can break
this circle. Operating systems implement the
<function>get_lock()</function> function by means of an atomic
<function>test_and_set()</function> machine instruction 
(see <xref linkend="sect-atomic">)
on a variable associated with the lock.
</para>

<para>
Another effective (but not necessarily efficient!) implementation of a lock is
as follows (borrowed from the &linux; kernel source code):
<programlisting>
int flags;

save_flags(flags);    // save the state of the interrupt vector
cli();                // disable interrupts
     // ... critical section ...
restore_flags(flags); // restore the interrupt vector to 
                      // its original state
sti();                // enable interrupts
</programlisting>
(Note that, in various implementations,
<function>restore_flags()</function> implicitly uses
<function>sti()</function>.)
</para>

<para>
The implementation described above is not always efficient because:
(i) in &smp; systems the <function>cli()</function> turns off
interrupts on <emphasis>all</emphasis> &cpu;s (see
<xref linkend="sect-int-basics">), and
(ii) if a <function>test_and_set()</function> can do the job, one
should use it, because the disabling of the interrupts and the saving
of the flags generate a lot of overhead.
</para>

<para>
The lock concept can easily lead to unpredictable latencies in the
scheduling of a task: the task can sleep while waiting for a lock to be
released; it doesn't have influence on how many locks other tasks are
using, how deep the locks are <emphasis>nested</emphasis>, or how
well-behaved other tasks use locks. <emphasis>Both</emphasis> tasks
involved in a synchronization using a lock have
(i) to agree about which lock they use to protect their common data (it
must be in their common address space!), (ii) to be disciplined enough
to release the lock, and (iii) to keep the critical
section as short as possible. Hence, the locks-based solution to
<emphasis>access or allocation constraints</emphasis> is equally
<emphasis>indirect and primitive</emphasis> as the priority-based
solution to <emphasis>timing constraints</emphasis>: it doesn't
protect the <emphasis>data</emphasis> directly, but synchronizes the
<emphasis>code</emphasis> that accesses the data. As with scheduling
priorities, locks give disciplined(!) programmers a means to reach
deterministic performance measures. But even discipline is not
sufficient to guarantee consistency in large-scale systems, where many
developers work more or less independently on different parts.
</para>

<para>
Locks are inevitable for task <emphasis>synchronization</emphasis>,
but for some common <emphasis>data exchange</emphasis> problems
there exist <emphasis>lock-free</emphasis> solutions
(see <xref linkend="sect-lock-free">).
The problem with using locks is that they make an application
vulnerable to the <emphasis>priority inversion</emphasis> problem
(see <xref linkend="sect-prior-inv">). Another problem occurs
when the &cpu; on which the task holding the lock is running,
suddenly fails, or when that task enters a trap and/or exception (see
<xref linkend="sect-int-basics">), because then the lock is not
released, or, at best its release is delayed.
</para>


<sect2 id="sect-semaphore">
<title>Semaphore</title>

<para>
The name &ldquo;semaphore<indexterm>
<primary>semaphore</primary></indexterm>&rdquo;
has its origin in the railroad world, where it was
the (hardware) signal used to (dis)allow trains to access sections of
the track: when the semaphore was lowered, a train could proceed and
enter the track; when entering, the semaphore was raised, preventing
other trains from entering; when the train in the critical section
left that section, the semaphore was lowered again.
</para>

<para>
Edsger Dijkstra<indexterm><primary>Dijkstra</primary></indexterm>
introduced the semaphore concept in the context of
computing in 1965, <citation>Dijkstra65</citation>.
A semaphore is an <emphasis>integer number</emphasis> (initialized to
a positive value), together with a set of function calls
<emphasis>to count</emphasis>
<function>up()</function><indexterm>
<primary><function>up()</function></primary></indexterm>
and <function>down()</function>.<indexterm>
<primary><function>down()</function></primary></indexterm>
&posix; names for <function>down()</function> and
<function>up()</function> are <function>sem_wait()</function> and
<function>sem_signal()</function>. &posix; also introduces the
<emphasis>non-blocking</emphasis> functions
<function>sem_post()</function> (set the semaphore) and
<function>sem_trywait()</function> (same as
<function>sem_wait()</function> but instead of blocking, the state
of the semaphore is given in the function's return value).
</para>
<para>
A task that executes a <function>sem_wait()</function> blocks if the
count is zero or negative. The count is incremented when a task
executes a <function>sem_signal()</function>; if this makes the
semaphore value non-negative again, the semaphore unblocks one of the
tasks that were blocking on it.
</para>
<para>
So, the number of tasks that a semaphore allows to pass without
blocking is equal to the positive number with which it is initialized;
the number of blocked tasks is indicated by the absolute value of a
negative value of the semaphore count.
</para>
<para>
The semaphore <parameter>S</parameter> must also be created
(<function>sem_init(S,initial_count)</function>)
and deleted
(<function>sem_destroy(S)</function>)
somewhere. The <parameter>initial_count</parameter> is the number of
allowed <emphasis>holders</emphasis> of the semaphore lock. Usually,
that number is equal to 1, and the semaphore is called a
<emphasis>binary semaphore<indexterm>
<primary>binary semaphore</primary></indexterm></emphasis>.
<indexterm>
 <primary>semaphore</primary><secondary>binary</secondary>
</indexterm>
The general case is called a
<emphasis>counting semaphore<indexterm>
<primary>counting semaphore</primary></indexterm></emphasis>.
<indexterm>
 <primary>semaphore</primary><secondary>counting</secondary>
</indexterm>
Most operating systems offer both, because their implementations
differ only in the initialization of the semaphore's count.
</para>

<para>
From an implementation point of view, the minimum data structure of a
semaphore has two fields:
<programlisting>
struct semaphore {
  int count; // keeps the counter of the semaphore.
  queue Q;   // lists the tasks that are blocked on the semaphore.
}
</programlisting>
And (non-atomic!) pseudo code for <function>sem_wait()</function> and
<function>sem_signal()</function> (for a <emphasis>binary</emphasis>
semaphore) basically looks like this (see, for example,
<filename>upscheduler/rtai_sched.c</filename> of the &rtai; code tree
for more detailed code):
<programlisting>
semaphore S;

sem_wait(S)
{
  if (S.count > 0) then S.count = S.count - 1; 
  else block the task in S.Q;
}

sem_signal(S)
{
  if (S.Q is non-empty) then wakeup a task in S.Q;
  else S.count = S.count + 1;
}
</programlisting>
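So, in implementations that decrement <parameter>S.count</parameter>
on every <function>sem_wait()</function> (unlike the simplified binary
version above, where the count never drops below zero), a negative
<parameter>S.count</parameter> indicates the fact that at least one
task is blocked on the semaphore; the absolute value of
<parameter>S.count</parameter> gives the number of blocked tasks.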
</para>


<para>
The semantics of the semaphore as a lock around a critical
section is exactly as in its historical railway inspiration. However,
a semaphore can also be used for different
<emphasis>synchronization<indexterm>
<primary>synchronization</primary></indexterm>
<indexterm>
<primary>semaphore</primary><secondary>synchronization</secondary>
</indexterm></emphasis> goals: if
<function>task A</function> just wants to
<emphasis>synchronize</emphasis> with <function>task B</function>,
(irrespective of the fact whether or not it needs to exclude
<function>task B</function> from entering a shared piece of code),
both tasks can use the <function>sem_wait()</function>
and <function>sem_signal()</function> function calls.
</para>

<para>
Here is a pseudo code example of two tasks <function>task A</function>
and <function>task B</function> that synchronize their mutual job by
means of a semaphore:
<programlisting>
semaphore S;

task A:                      task B:
main()                       main()
{ ...                        { ...
  do_first_part_of_job();      do_something_else_B();
  sem_signal(S);               sem_wait(S);
  do_something_else_A();       do_second_part_of_job();
  ...                          ...
}                             }
</programlisting>
</para>


<para>
Finally, note that a semaphore is a lock for which the normal
behaviour of the locking task is to go to sleep. Hence, this involves
the overhead of context switching, so don't use semaphores for critical
sections that should take only a very short time; in these cases 
<emphasis>spinlocks</emphasis> are a more appropriate choice
(<xref linkend="sect-spinlock">).
</para>

</sect2>


<sect2 id="sect-mutex">
<title>Mutex</title>
<para>
A mutex<indexterm><primary>mutex</primary></indexterm>
(<acronym>MUTual EXclusion</acronym>)<indexterm>
<primary>mutual exclusion</primary></indexterm>
is often defined as a synonym for a binary semaphore.
However, binary semaphore and mutex have an important semantic
distinction: a semaphore can be &ldquo;signaled&rdquo; and
&ldquo;waited for&rdquo; by <emphasis>any</emphasis> task, while only
the task that has <emphasis>taken</emphasis> a mutex is allowed to
release it. So, a mutex has an
<emphasis>owner<indexterm>
 <primary>mutex</primary><secondary>owner</secondary>
</indexterm></emphasis>,
as soon as it has been taken. This semantics of a mutex corresponds
nicely to its envisaged use as a lock that gives
<emphasis>only one task</emphasis> access to a critical section,
excluding all others.  That is, the task entering the critical section
<emphasis>takes</emphasis> the mutex, and
<emphasis>releases</emphasis> it when it exits the critical section.
When another task tries to take the mutex when the first
one still holds it, that other task will <emphasis>block</emphasis>.
The operating system unblocks one waiting task as soon as the first
task releases the mutex.
This mutually exclusive access to a section of the code is often also
called <emphasis>serialization</emphasis><indexterm>
<primary>serialization</primary></indexterm>.
</para>
<para>
A &posix; mutex,<indexterm>
<primary>&posix; mutex</primary></indexterm>
<indexterm>
<primary>mutex</primary><secondary>&posix;</secondary></indexterm>
for example, is a (counting) semaphore with
<emphasis>priority inheritance</emphasis> implied (see
<xref linkend="sect-prior-inherit">).
The basic &posix; &api; for mutexes is:
<programlisting>
<![CDATA[
pthread_mutex_t lock;
int pthread_mutex_init( // Initialise mutex object:
  pthread_mutex_t *mutex,
  const pthread_mutexattr_t *mutex_attr
);

// Destroy mutex object.
int pthread_mutex_destroy(pthread_mutex_t *mutex);

// Non blocking mutex lock:
int pthread_mutex_trylock(pthread_mutex_t *mutex);

// Blocking mutex lock:
int pthread_mutex_lock(pthread_mutex_t *mutex);

// Mutex unlock:
int pthread_mutex_unlock(pthread_mutex_t *mutex);
]]>
</programlisting>
 </para>

 <para>
A <emphasis>recursive mutex</emphasis><indexterm>
<primary>recursive mutex</primary></indexterm>
<indexterm>
 <primary>mutex</primary>
 <secondary>recursive</secondary>
</indexterm>
(or <emphasis>recursive semaphore</emphasis><indexterm>
<primary>recursive semaphore</primary></indexterm>)
<indexterm>
 <primary>semaphore</primary>
 <secondary>recursive</secondary>
</indexterm>
is a mutex that can be locked repeatedly by the owner. Otherwise the
thread that holds a mutex and would try to take the mutex again would
lock itself, hence leading to a deadlock.
This recursive property is useful for complex
mutual exclusion situations, such as in <emphasis>monitors<indexterm>
<primary>monitor</primary></indexterm></emphasis>,
<xref linkend="sect-monitor">.
</para>
<para>
The &posix; &api; requires you to indicate explicitly that a mutex should
be recursive:
<programlisting>
<![CDATA[
  pthread_mutexattr_settype(&mutex_attr, PTHREAD_MUTEX_RECURSIVE);
]]>
</programlisting>
Some operating systems (e.g., &vxworks;) use the recursive mutex mode
as the default.
Some offer a so-called <emphasis>fast</emphasis> mutex:<indexterm>
<primary>fast mutex</primary></indexterm>
<indexterm>
 <primary>mutex</primary><secondary>fast</secondary>
</indexterm>
such a mutex is locked and unlocked in the fastest manner possible on
the given operating system (i.e., it doesn't perform any error
checks). A fast mutex can only be locked one single time
by <function>pthread_mutex_lock()</function>, and
<emphasis>all</emphasis> subsequent calls cause the calling thread to
block until the mutex is freed; also the thread that holds the mutex
is locked, which causes a deadlock. So, be careful with using fast
mutexes.
</para>

 <para>
Many programmers tend to think that a semaphore is necessarily a more
primitive &rtos; function than
a mutex. This is not necessarily so, because one can implement a 
<emphasis>counting semaphore<indexterm>
<primary>counting semaphore</primary></indexterm></emphasis>
<indexterm>
 <primary>semaphore</primary><secondary>counting</secondary>
</indexterm>
with a mutex and a condition variable
(<xref linkend="sect-condvar">):
<programlisting>
<![CDATA[
int sem_wait(sem_t *sem)
{
   pthread_mutex_lock(&sem->mutex);
   while (sem->count == 0) pthread_cond_wait(&sem->cond, &sem->mutex);
   sem->count--;
   pthread_mutex_unlock(&sem->mutex);
   return(0);
}
]]>
</programlisting>
</para>

</sect2>


<sect2 id="sect-spinlock">
 <title>Spinlocks</title>
 <para>
A
&ldquo;spinlock<indexterm><primary>spinlock</primary></indexterm>&rdquo;
is the appropriate lock mechanism for multi-processor systems, and for
use in all kinds of contexts (kernel call, interrupt service routine,
etc.). They are phasing out the use of &ldquo;hard&rdquo; exclusion
methods such as <function>cli()</function> and <function>sti()</function>,
because: (i) these are too &ldquo;global&rdquo;, in the sense that they
don't specify the context in which the lock is needed; (ii) it is
usually not necessary to disable interrupts in order to protect two
tasks from entering a critical section.
However, you cannot do all kinds of things when running inside
a critical section locked by a spinlock! For example, do nothing that
can take a &ldquo;long&rdquo; time, or that can sleep. Use semaphores
or mutexes for these kinds of locks.
</para>
<para>
The task that wants to get a spinlock tries to get a lock
that is shared by all processors. If it doesn't get the lock, it keeps
trying (&ldquo;<emphasis>busy waiting<indexterm>
<primary>busy waiting</primary></indexterm>
<indexterm>
 <primary>lock</primary><secondary>busy waiting</secondary>
</indexterm></emphasis>&rdquo;) till it succeeds:
<programlisting>
int spinlock(spinlock_t l){
 while test_and_set(l) {};
};
</programlisting>
So, it's clear why you shouldn't do things that take a long time
within a spinlock context: another task could be busy waiting for you
all the time!
An example of a spinlock in the &linux; kernel is the
&ldquo;Big Kernel Lock<indexterm>
<primary>Big Kernel Lock</primary></indexterm>&rdquo;
(<xref linkend="arch">): the
BKL<indexterm><primary>BKL</primary></indexterm> is a
<emphasis>recursive</emphasis> spinlock,<indexterm>
<primary>recursive spinlock</primary></indexterm>
<indexterm>
 <primary>lock</primary><secondary>recursive</secondary>
</indexterm>
i.e., it can be locked multiple times recursively. That means that 
you (possibly in two separate tasks) can lock it twice in a row, but
you also have to release it twice after that.
</para>

<para>
Spinlocks come in three versions:
<orderedlist>

<listitem>
<para>
<function>spin_lock</function> and <function>spin_unlock</function>:
the classical mutual exclusion version, allowing interrupts to occur
while in the critical section.
</para>
</listitem>

<listitem>
<para>
<function>spin_lock_irq</function> and
<function>spin_unlock_irq</function>: as above, but with interrupts
disabled.
</para>
</listitem>

<listitem>
<para>
<function>spin_lock_irqsave</function> and
<function>spin_unlock_irqrestore</function>: as above, but saving the
current state flag of the processor.
</para>
</listitem>

</orderedlist>
All of them work on (the address of) variables of the type
<parameter>spinlock_t</parameter>. One should call
<function>spin_lock_init()</function> before using the lock.
The spinlock versions that disable interrupts do
<emphasis>not</emphasis> disable interrupts on the
<emphasis>other</emphasis> &cpu;s than the one the calling task is
running on, in order not to bring down the
throughput of the whole multi-processor system. An example (&linux;
specific!) of the usage (not the implementation!) of a spinlock with
local interrupt disabling is given here:
<programlisting>
<![CDATA[
spinlock_t l = SPIN_LOCK_UNLOCKED;
unsigned long flags;
spin_lock_irqsave(&l, flags);
/* critical section ... */
spin_unlock_irqrestore(&l, flags);
]]>
</programlisting>
So, both the concurrency and the multi-processor issues are dealt
with. On a uni-processor system, this should translate into:
<programlisting>
unsigned long flags;
save_flags(flags);
cli();
/* critical section ... */
restore_flags(flags);
</programlisting>
Note: the &posix; function <function>pthread_spin_lock()</function> has
this semantics of disabling interrupts.
 </para>

 <para>
Spinlocks are a trade-off between (i) disabling all interrupts on all
processors (costly, safe, but what you don't want to do on a
multi-processor system or a pre-emptable kernel), and (ii) wasting
time in busy waiting (which is the only alternative that remains).
So, spinlocks work if the programmer is disciplined enough to use them
with care, that is for guaranteed <emphasis>very</emphasis> short
critical sections. In principle, the latency induced by a spinlock is
<emphasis>not</emphasis> deterministic, which is in contradiction to
its use for real-time. But they offer a good solution in the case that
the scheduling and context switching times generated by the use of
locks, are larger than the time required to execute the critical
section the spinlock is guarding.
 </para>

 <para>
There is a reason why atomic <emphasis>test-and-set</emphasis>
operations are not optimal on multi-processor systems built from
typical PC architecture processors: the
<emphasis>test-and-set</emphasis> performed by one processor can make
parts of the caches on the other processors invalid because part of
the operation involves <emphasis>writing</emphasis> to memory. And
this cache invalidating lowers the benefits to be expected from
caching. But the following implementation can help a bit:
<programlisting>
int spinlock(spinlock_type l){ 
  while test_and_set(l) { // enter wait state if l is 1
    while (l == 1) {}     // stay in wait state until l becomes 0
  };
};
</programlisting>
The difference with the previous implementation is that the
<function>test_and_set()</function> requires a read <emphasis>and</emphasis>
a write operation (which
<emphasis>has</emphasis> to block memory access for other &cpu;s),
while the test <function>l == 1</function> requires only a read, which
can be done from cache.
</para>

</sect2>

<sect2 id="sect-readwrite-lock">
<title>Read/write locks<indexterm>
<primary>read/write lock</primary></indexterm>
<indexterm>
 <primary>lock</primary><secondary>read/write</secondary>
</indexterm>
</title>
<para>
Often, data has only to be
protected against concurrent writing, not concurrent reading. So, many
tasks can get a read lock at the same time for the same critical
section, but only one single task can get a write lock. Before this
task gets the write lock, all read locks have to be released. Read
locks are often useful to access complex data structures like linked
lists: most tasks only read through the lists to find the element they
are interested in; changes to the list are much less common.
(See also <xref linkend="sect-swinging-buf">.)
</para>
<para>
&linux; has a reader/writer spinlock (see below), that is used
similarly to the standard spinlock, with the exception of separate
reader/writer locking:
<programlisting>
<![CDATA[
rwlock_t rwlock = RW_LOCK_UNLOCKED; // initialize

read_lock(&rwlock);
/* critical section (read only) ... */ 
read_unlock(&rwlock);

write_lock(&rwlock);
/* critical section (read and write) ... */
write_unlock(&rwlock);
]]>
</programlisting>
Similarly, &linux; has a
<emphasis>read/write semaphore</emphasis>.<indexterm>
<primary>read/write semaphore</primary></indexterm>
<indexterm>
 <primary>semaphore</primary><secondary>read/write</secondary>
</indexterm>
</para>

</sect2>

<sect2 id="sect-barrier">
<title>Barrier<indexterm>
<primary>barrier</primary></indexterm>
<indexterm>
 <primary>lock</primary><secondary>barrier</secondary>
</indexterm>
</title>
<para>
Sometimes it is necessary to synchronize a lot of threads, i.e., they
should wait until <emphasis>all</emphasis> of them have reached a
certain &ldquo;barrier.&rdquo; A typical implementation initializes
the barrier with a counter equal to the number of threads, and
decrements the counter whenever one of the threads reaches the barrier
(and blocks).  Each decrement requires synchronization, so the barrier
cost scales linearly in the number of threads.
</para>

<para>
&posix; (1003.1-2001) has a
<function>pthread_barrier_wait()</function> function,
and a <parameter>pthread_barrier_t</parameter> type.
&rtai; has something similar to a barrier but somewhat more flexible,
which it calls &ldquo;<emphasis>bits</emphasis>&rdquo;<indexterm>
<primary>bits (&rtai;)</primary>
</indexterm>
(see file <filename>bits/rtai_bits.c</filename> in the &rtai; source
tree), and what some other operating systems call
<emphasis>flags<indexterm><primary>flag</primary>
</indexterm></emphasis>
or <emphasis>events<indexterm><primary>event</primary>
</indexterm></emphasis>.
The <function>bits</function> is a 32 bit value, that tasks
can share to encode any kind of AND or OR combination of binary flags.
It can be used as a barrier for a set of tasks, by initializing the
bits corresponding to each of the tasks to &ldquo;1&rdquo; and
letting each task that reaches the barrier reset its bit to
&ldquo;0&rdquo;. This is similar to a semaphore (or rather, an array
of semaphores), but it is not &ldquo;counting&rdquo;.
</para>


</sect2>

</sect1>


<sect1 id="sect-condvar">
<title>Condition variable for synchronization within mutex</title>
<para>
Condition variables<indexterm>
<primary>condition variable</primary></indexterm>
have been introduced for two reasons (which amount basically to one
single reason):
<orderedlist>

<listitem>
<para>
They allow a task to sleep until a certain
<emphasis>application-defined logical criterion</emphasis> is
satisfied.
</para>
</listitem>
<listitem>
<para>
They allow a task to sleep <emphasis>within</emphasis> a critical
section. (Unlike a semaphore.) This is in fact the same reason as
above, because the critical section is needed to evaluate the
application-defined logical criterion atomically.
</para>
</listitem>
</orderedlist>
The solution to this problem is well known, and consists of the
<emphasis>combination</emphasis> of three things:
<orderedlist>
<listitem>
<para>
A <emphasis>mutex lock</emphasis> (see <xref linkend="sect-mutex">).
</para>
</listitem>
<listitem>
<para>
A <emphasis>boolean expression</emphasis>, which represents the
above-mentioned logical criterion.
</para>
</listitem>
<listitem>
<para>
A <emphasis>signal</emphasis> (see <xref linkend="sect-signal">), that
other tasks can fire to wake up the task blocked in the condition
variable, so that it can re-check its boolean expression.
</para>
</listitem>
</orderedlist>
The lock makes it possible to check the boolean expression
&ldquo;atomically&rdquo; in a critical section, and to wait for the
signal within that critical section. It's the operating system's
responsibility to release the mutex behind the back of the task, when
it goes to sleep in the wait, and to take it again when the task is
woken up by the signal.
</para>
<para>
There exists a &posix; standard for condition variables.
Here are some of the major
prototypes for the <function>pthread_cond_wait()</function>
system call, used to make a task wait for its wake-up signal:
<programlisting>
<![CDATA[
#include <pthread.h>

   // Initialise condition attribute data structure:
int pthread_condattr_init(pthread_condattr_t *attr);

  // Destroy condition attribute data structure:
int pthread_condattr_destroy(pthread_condattr_t *attr);

   // Initialise conditional variable:
int pthread_cond_init(     
  pthread_cond_t *cond,
  const pthread_condattr_t *cond_attr
);

   // Destroy conditional variable:
int pthread_cond_destroy(pthread_cond_t *cond);

   // Wait for condition variable to be signaled:
int pthread_cond_wait(
  pthread_cond_t *cond,
  pthread_mutex_t *mutex
);

   // Wait for condition variable to be signaled or timed-out:
int pthread_cond_timedwait(
  pthread_cond_t *cond,
  pthread_mutex_t *mutex,
  const struct timespec *abstime
);

   // Restart one specific waiting thread:
int pthread_cond_signal(pthread_cond_t *cond);

   // Restart all waiting threads:
int pthread_cond_broadcast(pthread_cond_t *cond);
]]>
</programlisting>
Other system calls that take the same arguments are:
<function>pthread_cond_init()</function> (initialize the data
structure with which a condition variable is built),
<function>pthread_cond_signal()</function> (signal the fact that a
condition variable has changed state),
<function>pthread_cond_broadcast()</function> (signals the state
change to <emphasis>all</emphasis> tasks that are waiting for the
signal, and wakes them all),
<function>pthread_cond_timedwait()</function> (wait for the signal, or
for a timer to expire, whichever comes first).
</para>
<para>
The <function>sem_wait()</function> of <xref linkend="sect-mutex">
shows a typical application of a condition variable. We repeat the
code here for convenience:
<programlisting>
<![CDATA[
int sem_wait(sem_t *sem)
{
   pthread_mutex_lock(&sem->mutex);
   while (sem->count == 0) pthread_cond_wait(&sem->cond, &sem->mutex);
   sem->count--;
   pthread_mutex_unlock(&sem->mutex);
   return(0);
}
]]>
</programlisting>
The semaphore has a mutex <parameter>sem->mutex</parameter>, a
condition signal <parameter>sem->cond</parameter>, and its particular
boolean expression, namely its <parameter>count</parameter> being zero
or not. The checking of this condition, as well as the possible
decrement of the <parameter>count</parameter>, must be done in a
critical section, in order to synchronize access to the semaphore with
other tasks.
The <function>pthread_cond_wait()</function> function makes the
calling task block on the condition variable if the boolean expression
evaluates to false. The operating system releases the mutex when the
task must block, so that other tasks can use the semaphore. When the
condition is signaled (this is done by the complementary function
<function>sem_signal()</function>, which is not given here but that
executes a <function>pthread_cond_broadcast()</function>), the calling
task is woken up and its mutex is activated (all in one atomic
operation) such that the woken-up task can safely access the
critical section, i.e., check its boolean expression again.
The above-mentioned atomicity is guaranteed by the operating system,
which itself uses some more internal locks in its implementation of
the <function>pthread_cond_wait()</function> call.
</para>
<para>
It is essential that tasks that wake up from waiting on a condition
variable, <emphasis>re-check</emphasis> the boolean expression for
which they were waiting, because nothing guarantees that it is still
true at the time of waking up. Indeed, a task can be scheduled a long
time after it was signaled. So, it should also be prepared to wait
again. This leads to the almost inevitable <function>while</function>
loop around a <function>pthread_cond_wait()</function>. 
</para>
<para>
The <function>pthread_cond_broadcast()</function> should be the
default way to signal the condition variable, and not
<function>pthread_cond_signal()</function>. The latter is only an
<emphasis>optimization</emphasis> in the case that one knows for sure
that only one waiter must be woken up. However, this optimization
violates the <emphasis>loose coupling</emphasis> principle of good
software design (<xref linkend="chap-design">): if the application is
changed somewhat, the &ldquo;optimization&rdquo; of before could well
become a bottleneck, and solving the situation involves looking for
the <function>pthread_cond_signal()</function> calls that can be
spread over various files in the application.
</para>
<para>
However, blindly using <function>pthread_cond_broadcast()</function>
can also have a negative effect, called the
&ldquo;<emphasis>thundering herd<indexterm>
<primary>thundering herd</primary></indexterm></emphasis>&rdquo;
problem: <function>pthread_cond_broadcast()</function> can wake up a
large number of tasks, and in the case that only one task is needed to
process the broadcast, all other woken-up tasks will immediately go to
sleep again. That means the scheduler is hidden under a
&ldquo;herd&rdquo; of unnecessary wake-up and sleep calls.
So, &linux; and other operating systems introduced policies that
programmers can use to give some tasks the priority in wake-ups.
</para>
<para>
Both semaphores/mutexes and condition variables can be used for
<emphasis>synchronization</emphasis> between tasks.
However, they have some basic differences:
<orderedlist>
<listitem>
<para>
Signaling a semaphore has <emphasis>always</emphasis> an effect on the
semaphore's internal count. Signaling a condition variable can
sometimes have no effect at all, i.e., when no task is waiting for it.
</para>
</listitem>
<listitem>
<para>
A condition variable can be used to check an
<emphasis>arbitrarily complex</emphasis> boolean expression.
</para>
</listitem>
<listitem>
<para>
According to the &posix; rationale, a condition variable can be used
to make a task wait <emphasis>indefinitely long</emphasis>, but
spinlocks, semaphores and mutexes are meant for shorter waiting
periods. The reason is that <function>pthread_mutex_lock()</function>
is not a <emphasis>cancelling point<indexterm>
<primary>cancelling point</primary></indexterm></emphasis>, while the
<function>pthread_cond_wait()</function> is.
</para>
</listitem>
<listitem>
<para>
A condition variable is nothing more than a notification to a task
that the condition it was waiting for <emphasis>might</emphasis> have
changed. And the woken-up task <emphasis>should</emphasis> check that
condition again before proceeding. This check-on-wake-up policy is not
part of the semaphore primitive.
</para>
</listitem>
</orderedlist>
</para>

</sect1>


<sect1 id="sect-prior-inv">
<title>Priority inversion</title>
<para>
<indexterm>
<primary>priority inversion</primary></indexterm>
<indexterm><primary>inversion</primary>
<secondary>priority</secondary></indexterm>
Priority scheduling and locks are, in fact, contradictory OS
primitives: priority scheduling wants to run the highest priority job
first, while a mutex excludes <emphasis>every</emphasis> other job
(so, also the highest priority job) from running in a critical section
that is already entered by another job. And these contradictory goals
lead to tricky trade-offs. For example,
everybody coding multi-tasking systems using priority-based task
scheduling and locking primitives should know about the
&ldquo;priority inversion&rdquo; danger: in some situations, the use
of a lock prevents a task from proceeding because it has to wait for a
lower-priority task. The reason is that a low-priority task (i) is in
a critical section for which it holds the lock that blocks the
high-priority task, and (ii) it is itself pre-empted by a
medium-priority task that has nothing to do with the critical section
in which the high- and low-priority tasks are involved. Hence, the
name  &ldquo;priority inversion&rdquo;: a medium-priority job runs
while a high-priority task is ready to proceed. The simplest case is
depicted in <xref linkend="fig-prior-inv">. In that Figure,
<function>task H</function> is the high-priority task,
<function>task M</function> the medium-priority task, and
<function>task L</function> the low-priority task. At time instant
<parameter>T1</parameter>, <function>task L</function> enters the
critical section it shares with <function>task H</function>. At
time <parameter>T2</parameter>, <function>task H</function> blocks
on the lock issued by <function>task L</function>. (Recall that it
cannot pre-empt <function>task L</function> because that task has
the lock on their common critical section.)  At time
<parameter>T3</parameter>, <function>task M</function> pre-empts the
lower-priority task <function>task L</function>, and <emphasis>at
the same time</emphasis> also the higher-priority
<function>task H</function>. At time <parameter>T4</parameter>,
<function>task M</function> stops, and <function>task L</function>
gets the chance again to finish the critical section code at time
<parameter>T5</parameter> when, at last, <function>task H</function>
can run.

<figure id="fig-prior-inv" float="1" pgwide="0">
<title>
 Priority inversion.
</title>
<mediaobject>
<imageobject>
<imagedata fileref="rthowtofigs/prior-inv.png" format="PNG">
</imageobject>
<imageobject>
<imagedata fileref="rthowtofigs/prior-inv.eps" format="EPS">
</imageobject>
</mediaobject>
</figure>

The best-known practical case of a priority inversion problem occurred
during the Mars Pathfinder<indexterm>
<primary>Pathfinder</primary></indexterm>
<indexterm> <primary>Mars Pathfinder</primary></indexterm>
mission in 1997. (More information about this story can be found at
<ulink url="http://www.kohala.com/start/papers.others/pathfinder.html">
http://www.kohala.com/start/papers.others/pathfinder.html</ulink> or
<ulink
url="http://research.microsoft.com/~mbj/Mars_Pathfinder/Mars_Pathfinder.html">
http://research.microsoft.com/~mbj/Mars_Pathfinder/Mars_Pathfinder.html</ulink>.)
</para>

</sect1>


<sect1 id="sect-prior-inherit">
<title>Priority inheritance and priority ceiling</title>
<para>
<indexterm><primary>priority inheritance</primary></indexterm>
<indexterm><primary>priority ceiling</primary></indexterm>
Operating system programmers have tried to &ldquo;solve&rdquo; (not
prevent) the priority inversion problem, in two different ways:
<itemizedlist>
 <listitem>
 <para>
<emphasis>Priority inheritance.</emphasis> 
A low-priority task that holds the lock requested by a high-priority
task temporarily &ldquo;inherits&rdquo; the priority of that
high-priority task, <emphasis>from the moment the high-priority task
does the request</emphasis>. That way, the low-priority task will not
be pre-empted by medium-level priority tasks, and will be able to
finish its critical section without holding up the high-priority task
any longer than needed. When it releases the lock, its priority drops
to its original level, while the high-priority task will now get the
lock. The maximum predictable delay is the length of the critical
section of the low-priority task.
 </para>
 <para>
Priority inheritance generates <emphasis>run-time</emphasis> overhead,
because the scheduler has to inspect the priorities of all tasks that
access a lock.
 </para>
 </listitem>
 <listitem>
 <para>
<emphasis>Priority ceiling.</emphasis> Every lock gets a priority
level corresponding to the priority of the highest-priority task that
<emphasis>can</emphasis> use the lock.
This level is called the <emphasis>ceiling priority</emphasis>.
Note that it is the <emphasis>lock</emphasis> that gets
a priority, which it gives to every task that tries the lock.
So, when the low-priority task enters the critical section, it
<emphasis>immediately</emphasis> gets the ceiling priority from the
lock, such that it will not be pre-empted by any medium-level priority
task. Therefore, another name of the priority ceiling protocol is
<emphasis>instant inheritance</emphasis>.<indexterm>
<primary>instant inheritance</primary></indexterm>
<indexterm><primary>inheritance</primary>
<secondary>instant</secondary></indexterm>
</para>
<para>
Priority ceiling generates <emphasis>compile-time</emphasis> overhead,
because it can already at that moment check the priorities of all
tasks that will request a lock.
</para>
<para>
Priority ceiling has the pleasant property that it simplifies
implementation and has small run-time overhead (only the change in
priority for the task entering a critical section): the lock
<emphasis>never has to be tested</emphasis> for being free or not,
because any task that tries the lock runs at the highest priority to enter
the critical section: any other task that could test the lock would run at
the same ceiling priority, and hence would not have been interrupted in its
critical section by the task that currently tests the lock. Indeed, both
tasks live in the same priority level and are scheduled with a
<parameter>SCHED_FIFO</parameter><indexterm>
<primary>SCHED_FIFO</primary></indexterm>
policy. Instant inheritance also offers a solution to the
&ldquo;deadly embrace&rdquo;<indexterm>
<primary>deadly embrace</primary></indexterm><indexterm>
<primary>race condition</primary>
<secondary>deadly embrace</secondary></indexterm>
(see <xref linkend="sect-race">)
occurring when two tasks lock <emphasis>nested</emphasis> critical
sections in opposite order: the first task to enter the outermost lock
will have the appropriate priority to finish the complete set of
nested critical sections.
 </para>
<para>
A possible problem with priority ceiling is that it makes more
processes run at higher priorities, for longer times than necessary.
Indeed, the priorities of tasks are changed,
<emphasis>irrespective</emphasis> of the fact whether another task
will try to request the lock or not. This reduces the discriminating
effects of using priorities in the first place,
<emphasis>and</emphasis> it gives rise to &ldquo;hidden&rdquo;
priority inversion:<indexterm>
<primary>hidden priority inversion</primary></indexterm>
<indexterm>
<primary>priority inversion</primary>
<secondary>hidden</secondary></indexterm>
while task <parameter>L</parameter> gets its
priority raised to the ceiling priority because it is involved in a
lock with another task <parameter>V</parameter> that has a very high
priority, a third task <parameter>H</parameter> not involved in the
lock could get pre-empted by <parameter>L</parameter> although its
priority is higher and <parameter>V</parameter> is dormant most of the
time.
</para>
 </listitem>
</itemizedlist>
Priority ceiling and inheritance look great at first sight, and they
are part of some OS standards: priority ceiling is in the &posix;
standard (<parameter>POSIX_PRIO_PROTECT</parameter>), the Real-Time
Specification for Java (RTSJ), &osek;, and the Ada 95 real-time
specifications. Priority inheritance is also part of standards such as
POSIX (<parameter>POSIX_PRIO_INHERIT</parameter>), and the RTSJ.
But priority ceiling and inheritance
can still not <emphasis>guarantee</emphasis> that no inversion or
indeterministic delays will occur, <citation>Yodaiken2002</citation>,
<citation>Locke2002</citation>. Moreover, 
the priority inheritance &ldquo;feature&rdquo; gives rise to code that
is more complex to understand and certainly to predict. Also,
determining <emphasis>a priori</emphasis> the ceiling priority for a
lock is not an easy matter (the compiler must have access to
<emphasis>all</emphasis> code that can possibly use a lock!), and can
cause portability and extendability headaches.
</para>
<para>
Priority inversion is always a result of a
<emphasis>bad design</emphasis>, so it's much better to
<emphasis>prevent</emphasis> race conditions instead of
&ldquo;solving&rdquo; them. However, contrary to the deadlock
prevention algorithm (<xref linkend="sect-race">),
no similarly simple and guaranteed algorithm for priority
inversion is known. So, all an operating system could do to help
the programmers is signalling when priority inversion takes place,
such that they can improve their design.
</para>
<para>
Most &rtos;es don't apply priority inversion solutions for every case
of sender-receiver synchronization.  For example
<application>Neutrino</application><indexterm>
<primary>Neutrino</primary></indexterm>
(from &qnx;)<indexterm>
<primary>&qnx;</primary></indexterm>
uses separate synchronization mechanisms for critical sections
(semaphores) and sender-receiver (which is synchronous &ipc; in &qnx;).
It solves priority inversion only so long as applications use a
many-to-one &ipc;<indexterm>
<primary>IPC</primary>
<secondary>many-to-one</secondary></indexterm>
<indexterm><primary>many-to-one IPC</primary></indexterm>
model. As soon as an application uses many-to-many &ipc;<indexterm>
<primary>IPC</primary>
<secondary>many-to-many</secondary></indexterm>
<indexterm><primary>many-to-many IPC</primary></indexterm>
(via a &posix; queue) there is no more prevention of priority
inversion. Many-to-many is inherently difficult because the kernel has
no way to know which receiver might be ready next, so all it could do
would be to raise the priority of all potential listeners (and the
processes upon which they are waiting). And this would often result in
a logjam as every process was raised to the same priority,
invalidating exactly the major reason why priorities were introduced
in the first place.
</para>

</sect1>


<sect1 id="sect-lock-free">
<title>Lock-free synchronization for data exchange</title>
<para>
Some synchronization can also be done <emphasis>without</emphasis>
locks, and hence this is much more efficient and guaranteed to be
deadlock-free, <citation>Herlihy91</citation>,
<citation>Herlihy93</citation>.
Lock-free synchronization uses the
<function>compare_and_swap(address,old,new)</function><indexterm>
<primary><function>compare_and_swap</function></primary></indexterm>
 (see
<xref linkend="sect-atomic">), or similar constructs.
This functionality is applicable to the manipulation of
<emphasis>pointers</emphasis>, e.g., to interchange two buffers in one
atomic operation, or to do linked list, queue or stack operations. 
</para>
<para>
The following code fragment shows the basic form of this
<emphasis>pointer swinging</emphasis>:
<programlisting>
ptr = ...
do {
  old = ptr;
  new = new_value_for_pointer;
} while ( !compare_and_swap(ptr,old,new) );
</programlisting>
If the <function>compare_and_swap()</function> returns
&ldquo;false&rdquo;, the swinging of the pointers should not be done,
because some other task has done something with the pointer in the
meantime.
 </para>

 <para>
Recall the possible problem with
<function>compare_and_swap()</function>: it only compares the
<emphasis>values</emphasis> of the addresses, not whether this value
has been changed!  This means that a double change of the pointer (back
to its original value) will not be detected. This occurs quite
frequently, i.e., any time when memory space is re-used, e.g., in a
stack or a linked list.
</para>
<para>
Another problem of using <function>compare_and_swap</function> for
lock-free synchronization is that it is not always the most efficient
method available, because it involves the <emphasis>copying</emphasis>
of a complete data structure before that data structure can be updated
without a lock.
</para>

</sect1>


</chapter>


<chapter id="ipc-dataexchange">
<title>IPC: Data exchange</title>

<para>
<indexterm><primary>data exchange</primary></indexterm>
<indexterm><primary>&ipc;</primary><secondary>data exchange</secondary>
</indexterm>
The previous Chapter looked at the
<emphasis>synchronization</emphasis> aspect of &ipc;; this Chapter
deals with the mechanisms and policies of
<emphasis>data exchange</emphasis>. The emphasis is on data exchange
for real-time systems.
</para>
<para>
The <emphasis>mechanism</emphasis><indexterm>
<primary>mechanism</primary></indexterm>
<indexterm>
<primary>&ipc;</primary><secondary>mechanism</secondary>
</indexterm>of all data exchange &ipc; is quite similar: the operating
system has some memory space reserved for the data that has to be
exchanged, and uses some synchronization &ipc; primitives for reading
or writing to that memory space. There is some
&ldquo;object&rdquo; responsible for the memory and the locks; we call
this object the <emphasis>mediator</emphasis>,<indexterm>
<primary>mediator</primary></indexterm>
<indexterm>
<primary>&ipc;</primary><secondary>mediator</secondary>
</indexterm>
<indexterm>
<primary>data exchange</primary><secondary>mediator</secondary>
</indexterm>
(<xref linkend="sect-mediator">)
or the
<emphasis>channel</emphasis>.<indexterm>
<primary>channel</primary></indexterm>
<indexterm>
<primary>&ipc;</primary><secondary>channel</secondary>
</indexterm>
<indexterm>
<primary>data exchange</primary><secondary>channel</secondary>
</indexterm>
The mediator is really the heart of the data exchange: the &ipc;
clients make function calls on it, but it's the mediator that takes
care of memory allocation, buffering, locking, signalling, etc.
Although a mediator is more of an <emphasis>object-oriented
design</emphasis> concept, it is there already in most of the old
&ccc; code of operating systems. The bad news is that most operating
system designers didn't realize that, and hence, they didn't reuse the
mediator code when implementing the myriads of &ipc; forms they
developed&hellip;
</para>

<para>
It's especially in their <emphasis>policy</emphasis><indexterm>
<primary>policy</primary></indexterm>
<indexterm>
<primary>&ipc;</primary><secondary>policy</secondary>
</indexterm>(i.e., the choice of <emphasis>how</emphasis> the
low-level mechanism is being used) that the different forms of data
exchange differ from each other. Below is a non-exhaustive list of
policies for data exchange. Almost every possible combination of
options is feasible, so it should come as no surprise that operating
systems tend to have data exchange primitives with not quite the same
&api;&hellip;
<itemizedlist>
 <listitem>
 <para>
<emphasis>(No) Data loss.</emphasis> Whether or not everything that the
&ldquo;sender&rdquo; sends to the &ldquo;receiver&rdquo; (or rather,
to the &ipc; mediator object) will indeed be received by the
&ldquo;receiver&rdquo;.
 </para>
 </listitem>
 <listitem>
 <para>
<emphasis>(Non)Blocking.</emphasis> (Also called
<emphasis>(a)synchronous</emphasis>.)<indexterm>
<primary>synchronous</primary></indexterm>
<indexterm><primary>asynchronous</primary></indexterm>
<indexterm><primary>blocking</primary></indexterm>
<indexterm><primary>&ipc;</primary><secondary>blocking</secondary>
</indexterm>
The &ldquo;sender&rdquo; and/or
&ldquo;receiver&rdquo; block until the exchange is finished.
 </para>
 </listitem>
 <listitem>
 <para>
<emphasis>One-to-many/many-to-one/many-to-many/one-to-one.</emphasis>
There is one single
&ldquo;sender&rdquo; and multiple &ldquo;receivers&rdquo;. Or any
variation on this theme.
 </para>
 </listitem>
 <listitem>
 <para>
<emphasis>Named/anonymous.</emphasis> The &ldquo;sender&rdquo; must
explicitly give the identification of the &ldquo;receivers&rdquo;, or
it must only name the mediator. 
 </para>
 </listitem>
 <listitem>
 <para>
<emphasis>(Non)Buffered.</emphasis> The &ldquo;sender&rdquo; sends
some data to the &ldquo;receiver&rdquo;, but only
<emphasis>indirectly</emphasis>: the data is stored in a buffer, from
which the &ldquo;receiver&rdquo; reads at its own leisure.
 </para>
 </listitem>
 <listitem>
 <para>
<emphasis>(Non)Prioritized.</emphasis> A message can get a priority,
and the highest priority message gets delivered first. Similarly, the
tasks that wait for a data exchange (hence, which are blocked by the
lock of the mediator), can be woken up according to their static
priority.
 </para>
 </listitem>
</itemizedlist>
</para>

<para>
There does exist some standardization in data exchange &ipc;:
&posix; has its standard 1003.1b, in which it specifies message
queues, with at least 32 priorities and priority-based task queues on their
locks.
</para>


<sect1 id="sect-shared-mem">
<title>Shared memory</title>
<para>
<indexterm><primary>shared memory</primary></indexterm>
Two (or more) tasks can exchange information by reading and writing
the same area in memory. 
The main advantage is that the data exchange can take place with
<emphasis>zero copying</emphasis>,<indexterm>
<primary>zero copying</primary></indexterm>
<indexterm>
<primary>&ipc;</primary>
<secondary>zero copying</secondary>
</indexterm>
<indexterm>
<primary>data exchange</primary>
<secondary>zero copying</secondary>
</indexterm>
because the &ldquo;buffer&rdquo; is just one deep. For the rest, any
policy can be implemented on top of shared memory. One area where
shared memory is very popular is for the data exchange with peripheral
hardware, if possible under
<emphasis>Direct Memory Access (DMA)</emphasis>.<indexterm>
<primary>Direct Memory Access</primary></indexterm>
<indexterm><primary>DMA</primary></indexterm>
</para>
<para>
Available RAM memory is the only limit to the number of independent shared
memory &ipc; &ldquo;channels&rdquo;. The shared memory must be reserved
from the operating system, and locked into RAM.
(See <xref linkend="sec-mem-general">.)
If the tasks
involved in the shared memory &ipc; want to know how &ldquo;fresh&rdquo;
the data in the shared segment is, they have to implement their own
handshake protocol themselves, because the operating system gives no
indication as to whether data from shared memory has already been
accessed or not. One common approach is to put a counter in the shared
memory data structure, that indicates how many writes have already
taken place. This counter could be a time stamp, which is, in
addition, particularly useful for an asynchronous monitor task in user
space: that task, for example, plots the data from the real-time task
at its own rate, and the time stamps provide a way to keep the data
plotted on a correct time line.
</para>

</sect1>


<sect1 id="sect-fifo">
<title>&fifo;s</title>
<para>
<indexterm><primary>&fifo;</primary></indexterm>
Shared memory has the properties of a so-called
<emphasis>block device</emphasis>:<indexterm>
<primary>block device</primary></indexterm>
programs can access arbitrary blocks on the device, in
any sequence. <emphasis>Character devices</emphasis>,<indexterm>
<primary>character device</primary></indexterm>
on the other hand, can access the data only in a specified linear
sequence.
A &fifo; (<emphasis>First-In, First-Out</emphasis>) is such a character
device &ipc;: its mediator policy is
<emphasis>loss-free</emphasis>, <emphasis>non-blocking</emphasis>
(unless the &fifo; is empty or full),
in principle
<emphasis>many-to-many</emphasis> but in practice often
<emphasis>1-to-1</emphasis> (i.e., only one sender and one receiver), and
<emphasis>buffered</emphasis> (i.e., &fifo;s put data in a
<emphasis>pipeline</emphasis>, where the sender adds data on one end,
and the reader reads it at the other end). 
</para>
<para>
Some &fifo; implementations support blocking for
synchronization at the reader's end, so the reader gets woken up as
soon as new data has arrived. &fifo;s that implement blocking have a
&ldquo;task queue&rdquo; data structure in which blocked tasks can
wait. 
</para>
<para>
&fifo;s often also allow <emphasis>asynchronous</emphasis> data
exchange: a task can register a <emphasis>&fifo; handler<indexterm>
<primary>&fifo; handler</primary></indexterm>
<indexterm>
 <primary>handler</primary><secondary>&fifo;</secondary>
</indexterm></emphasis>
that the operating system executes after data has been put into the
&fifo;. This is done with exactly the same &isr;-&dsr; principle as
for hardware and software interrupts (<xref linkend="sect-idsr">): the
writing of data into the &fifo; also fires an event that activates the
&fifo; handler; this handler will be a <emphasis>tasklet<indexterm>
<primary>tasklet</primary></indexterm></emphasis>
(<xref linkend="sect-idsr">), that, just as in the case of an
interrupt, is executed by the operating system before it does its next
scheduling.
</para>
<para>
The boundaries between successive data in the &fifo; need 
not necessarily be sharp, because different blocks might have different
sizes. However, in that case, one speaks more often about
<emphasis>mailboxes</emphasis>, see <xref linkend="sect-mess">.
</para>
<para>
The <emphasis>mediator<indexterm>
<primary>mediator</primary><secondary>&fifo;</secondary>
</indexterm></emphasis> 
implementing the &fifo; uses a lock for mutual exclusion during read
or write, and in order to keep the &fifo;'s task queue data structures
consistent. However, if the &fifo; runs between a real-time task and a
user space task, the locking problem is very much simplified: the
real-time task can never be interrupted by the user task (because it
runs in kernel space) so no lock is needed at the real-time side.
</para>
</sect1>


<sect1 id="sect-mess">
<title>Messages and mailboxes</title>
<para>
<indexterm><primary>message</primary></indexterm>
<indexterm><primary>mailbox</primary></indexterm>
Messages and mailboxes allow the sender to send data in
<emphasis>arbitrary chunks</emphasis>, only
limited by available memory in the buffer. The message contains some
&ldquo;meta-information&rdquo; about the size and sender of the
message; or whatever data that the message protocol prescribes. 
This meta-information is, in fact, the only practical difference with
&fifo;s. From an implementation point of view, &fifo;s, messages and
mailboxes all look very similar, in the sense that there is the
<emphasis>mediator<indexterm>
<primary>mediator</primary><secondary>mailbox</secondary>
</indexterm></emphasis> object that takes
care of the buffer, the locks and the queues of waiting tasks.
Moreover, many operating systems make no distinction between
messages and mailboxes. If they do make a distinction, it is the
following:
<itemizedlist>
 <listitem>
 <para>
<emphasis>Message.</emphasis> The sender puts its message in a memory
space it has allocated itself, and then sends the
<emphasis>address</emphasis> of that memory to the OS, together with
an identification of the receiver. The receiver asks the OS whether
there are messages for it, and decides to read them or not. Reading 
a message is done in the same place as where it was written.  If
desired, a counter on the message data allows for
<emphasis>1-to-many</emphasis> &ipc;.
 </para>
 <para>
Be careful with this oversimplified description: passing the address
of a data chunk is error-prone (multi-processor systems; virtual
memory; &hellip;).
 </para>
 </listitem>

 <listitem>
 <para>
<emphasis>Mailbox.</emphasis> The sender notifies the OS that it has a
message, and gives the identification of the receiver. The OS then
<emphasis>copies</emphasis> the message to the mailbox of the
receiver; this mailbox is a buffer managed by the <emphasis>operating
system</emphasis>. The receiver task reads the messages in the
order of arrival.  (Of course, variations on this policy exist.)
 </para>
 </listitem>

</itemizedlist>
A natural extension to messages or mailboxes is
<emphasis>synchronous message passing<indexterm>
<primary>synchronous message passing</primary></indexterm>
<indexterm>
 <primary>message</primary><secondary>synchronous</secondary>
</indexterm></emphasis>,
sometimes also called
<emphasis>Send/Receive/Reply</emphasis><indexterm>
<primary>Send/Receive/Reply</primary></indexterm>
<indexterm>
<primary>message</primary><secondary>Send/Receive/Reply</secondary>
</indexterm> (because that's what it is called in &qnx;):<indexterm>
<primary>&qnx;</primary></indexterm>
the &ldquo;sender&rdquo; sends a
message, and waits until the &ldquo;receiver&rdquo; has acknowledged the
reception of the message. Hence, <emphasis>two</emphasis> messages are
exchanged in this form of &ipc;.
The <ulink url="http://www.holoweb.net/~simpl/">&simpl;</ulink>
(<emphasis>Synchronous Interprocess Messaging Project for
Linux</emphasis>) project offers a free software implementation of this
form of message passing.
</para>

<para>
&posix; has standardized an &api; for messages (with the semantics
of what was called &ldquo;mailboxes&rdquo; above, i.e., the message
queues are managed by the operating system). Here are the basic
data structures and prototypes:
<programlisting>
<![CDATA[
struct mq_attr {
    long mq_maxmsg;    // Maximum number of messages in queue
    long mq_msgsize;   // Maximum size of a message (in bytes)
    long mq_flags;     // Blocking/Non-blocking behaviour specifier
                       //   not used in mq_open; only relevant
                       //   for mq_getattr and mq_setattr
    long mq_curmsgs;   // Number of messages currently in queue
};

   // Create and/or open a message queue:
mqd_t mq_open(
  char *mq_name,
  int oflags,
  mode_t permissions,
  struct mq_attr *mq_attr
);

   // Receive a message:
ssize_t mq_receive(
  mqd_t mq,
  char *msg_buffer,
  size_t buflen,
  unsigned int *msgprio
);

   // Send a message to a queue
int mq_send(
  mqd_t mq,
  const char *msg,
  size_t msglen,
  unsigned int msgprio
);

   // Close a message queue:
int mq_close(mqd_t mq);

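   // Get the attributes of a message queue:
int mq_getattr(mqd_t mq, struct mq_attr *attrs);

   // Set the attributes of a message queue (the previous
   // attributes are returned in old_attrs):
int mq_setattr(mqd_t mq, const struct mq_attr *new_attrs,
                                struct mq_attr *old_attrs);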

   // Register a request to be notified whenever a message 
   // arrives on an empty queue:
int mq_notify(mqd_t mq, const struct sigevent *notification);

   // Destroy a message queue:
int mq_unlink(char *mq_name);
]]>
</programlisting>
</para>
</sect1>


<sect1 id="sect-circ-buf">
<title>Circular buffers</title>
<para>
A circular<indexterm><primary>circular buffer</primary></indexterm>
<indexterm>
<primary>buffer</primary><secondary>circular</secondary>
</indexterm>(or ring)<indexterm>
<primary>ring buffer</primary></indexterm>
<indexterm>
<primary>buffer</primary><secondary>ring</secondary></indexterm>
buffer has most of the properties of shared memory, except that
its depth is larger than one (i.e., it can
contain more than one of the data structures exchanged in the
communication).  The buffer is usually implemented as an array of
communication data structures, and the positions of sender and
receiver are indicated by pointers in this array.  When one of these
pointers reaches the end of the buffer, it wraps back to the start of
the buffer and continues from there. So, data is lost when the sender
pointer overtakes the reader pointer; data is read multiple times if
the reader pointer overtakes the writer pointer. It's
straightforward to use a lock to avoid these situations. In that case,
the lock makes the buffer <emphasis>blocking</emphasis>. A lock can
also be set on each data item in the buffer, in order to avoid
concurrent access of the same data.
</para>
<para>
Two common options for buffers (especially in real-time applications)
are:
<itemizedlist>
 <listitem>
 <para>
<emphasis>Locking in memory.</emphasis> The memory used for the buffer
should not be swapped out of the physical RAM.
 </para>
 </listitem>
 <listitem>
 <para>
<emphasis>&ldquo;Buffer Half Full&rdquo;</emphasis><indexterm>
<primary>buffer half full</primary></indexterm>
<emphasis>(High water/Low water) interrupt.</emphasis><indexterm>
<primary>high water</primary></indexterm>
<indexterm><primary>low water</primary></indexterm>
<indexterm><primary>buffer watermark</primary></indexterm>
<indexterm><primary>watermark</primary></indexterm>
The sender and/or receiver tasks can raise an event if the
buffer is more than half full or half empty. This event must
wake up the other part of the &ipc;, such that it can take the
appropriate actions to prevent the buffer from overflowing or getting
empty.
 </para>
 </listitem>
</itemizedlist>
</para>

</sect1>


<sect1 id="sect-swinging-buf">
<title>Swinging buffers</title>
<para>
A swinging buffer<indexterm>
<primary>swinging buffer</primary></indexterm>
 (or &ldquo;flip buffer&rdquo;)<indexterm>
<primary>flip buffer</primary></indexterm>
<indexterm><primary>buffer</primary><secondary>flip</secondary>
</indexterm>
<indexterm><primary>buffer</primary><secondary>swinging</secondary>
</indexterm>
is two things:
<itemizedlist>

 <listitem>
 <para>
<emphasis>An advanced circular buffer.</emphasis> Instead of using one
single shared memory array, a swinging buffer uses two or more. The
sender fills up one of the buffers, while the receiver empties another
one. Every time one of the tasks reaches the end of its buffer, it
starts operating on a buffer that the other task is not using.
 </para>
 </listitem>

 <listitem>
 <para>
<emphasis>A deadlock-free &ldquo;lock.&rdquo;</emphasis>
Both tasks operate on different data structures, hence no locks are
used to access the data. Only when the tasks must decide which buffer
to use, they use a lock on the buffer pointers, or the corresponding
atomic pointer switching operation
(see <xref linkend="sect-lock-free">). In this latter case, the
&ldquo;lock&rdquo; is atomic in hardware, and hence cannot cause any
of the problems generated by software locks.
 </para>
 </listitem>

</itemizedlist>
So, a swinging buffer is <emphasis>non-blocking</emphasis> but
<emphasis>loss-prone</emphasis>, because one task can fill or empty
the same buffer of the swinging buffer pair multiple times before the
other task is ready to switch buffers.
</para>
<para>
The swinging buffer approach is also known under the name of
<emphasis>read-copy-update<indexterm>
<primary>read-copy-update</primary></indexterm>
<indexterm>
<primary>swinging buffer</primary>
<secondary>read-copy-update</secondary>
</indexterm>
(<ulink
 url="http://lse.sourceforge.net/locking/rcu/rcupdate_doc.html">RCU</ulink>
<indexterm><primary>RCU</primary></indexterm>)</emphasis>.
It can be used as an alternative for <emphasis>read-write</emphasis>
locks (<xref linkend="sect-readwrite-lock">) for
&ldquo;frequent reads/infrequent writes&rdquo; applications: the
readers follow a pointer, and need no locks, while the (less frequent)
writer swaps the pointers after having filled in the new data
structure.
</para>
</sect1>


<sect1 id="sect-rpc">
<title>Remote Procedure Calls</title>
<para>
The previous flavours of &ipc; can all be catalogued as
&ldquo;low-level&rdquo;: they are implemented with very basic OS
primitives, and are usually shielded from the users within system calls.
One of the popular &ipc; mechanisms at the user level is <emphasis>Remote
Procedure Calls</emphasis> (&rpc;). With &rpc;, a user can invoke the
execution of a task on a remote computer, as if that task ran on the
processor of the calling task. &rpc; is implemented on top of messages,
with a synchronizing hand-shake protocol. Obviously, &rpc; is not very
real-time, but could be useful for embedded systems.
</para>
<para>
On the other hand, &rpc; is the simplest form of what is also called
<emphasis>distributed</emphasis> or
<emphasis>embedded components</emphasis>: software objects
(&ldquo;<emphasis>agents</emphasis>&rdquo;) that can live on any computer
on a network, and that tasks can access transparently. There are three
major standards in the area of distributed components:
<itemizedlist>
 <listitem>
 <para>
<emphasis><ulink url="http://www.corba.org">&corba;</ulink>
 (Common Object Request Broker Architecture)</emphasis>.<indexterm>
<primary>CORBA</primary></indexterm>
This is a fully platform and vendor independent initiative.
 </para>
 </listitem>
 <listitem>
 <para>
<emphasis>&dcom;</emphasis>,<indexterm>
<primary>&dcom;</primary></indexterm> controlled by Microsoft.
 </para>
 </listitem>
 <listitem>
 <para>
<emphasis><acronym>RMI</acronym><indexterm>
<primary>RMI</primary></indexterm>
(Remote Method Invocation)</emphasis>,<indexterm>
<primary>Remote Method Invocation</primary></indexterm>
from the &java; world.
 </para>
 </listitem>
</itemizedlist>
A real-time extension to &corba; was specified in 2001. It takes care
of the <emphasis>specification</emphasis> of the determinism required
for real-time applications, but needs to map its functionality onto
primitives offered on the host operating system. Of course, the
absolute time scales of &corba; real-time are longer than those of a
stand-alone computer system.
</para>

<para>
(TODO: more details about &corba;, and real-time &corba;.)
</para>

</sect1>

</chapter>


<chapter id="memory-management">
<title>Memory management</title>
<para>
This Chapter explains what <emphasis>memory management</emphasis> means,
and how it influences the real-time behaviour of an operating system.
Non real-time aspects of memory management (virtual
memory, swapping, dirty pages management, etc.) are outside the scope
of this document.
</para>


<sect1 id="sec-mem-general">
<title>Terminology</title>
<para>
All tasks need RAM memory to execute. Not only for placing their data, but
also for their code and for &ipc; with other tasks. A computer system
offers a (most often) contiguous space of physical RAM, and the
&mmu;<indexterm><primary>&mmu;</primary></indexterm>
(<emphasis>Memory Management Unit</emphasis>)<indexterm>
<primary>Memory Management Unit</primary></indexterm>
of the hardware, and the <emphasis>Virtual Memory</emphasis><indexterm>
<primary>virtual memory</primary></indexterm>
software of the operating system, help to give a task the impression
that it is the only one that uses the memory. And
that that memory is
(i) larger than the physically available RAM;
(ii) distributed (<emphasis>transparently to the task</emphasis>) over
a number of physically non-contiguous memory
<emphasis>pages</emphasis> of fixed size; and
(iii) protected from access by other tasks.
</para>

<para>
But these general-purpose OS requirements are not those of real-time
systems, or of embedded systems on processors without &mmu;.  Their
concerns are:
<itemizedlist>
 <listitem>
 <para>
<emphasis>Fast and deterministic memory management.</emphasis> The
fastest and most deterministic approach to memory management is no
memory management at all. This means that the programmers have all
physical RAM available as one contiguous block that they can use as
they like. This approach is usually only an option for small embedded
systems that run a fixed and small number of tasks. Other &rtos;s and
&eos;s offer at least the basic memory management: memory allocation
and deletion through system calls.
 </para>
 </listitem>
 <listitem>
 <para>
<emphasis>Page locking.</emphasis><indexterm>
<primary>page locking</primary></indexterm>
Demand paging<indexterm><primary>demand paging</primary></indexterm>
is the common approach in general purpose operating systems to
distribute the scarce physical RAM over all tasks: each task gets a
number of pages in RAM, and the pages it hasn't accessed recently are
&ldquo;swapped out&rdquo; to make room for pages of other tasks. This
swapping is a non-deterministic thing, because it needs access to
disk, and most disk controllers have non-deterministic buffering for
optimising the average throughput to or from the disk: when the task
needs code or data from one of its pages that is currently swapped
out, the page has to be retrieved from disk, and often another page in
RAM has first to be swapped out to disk.  Hence, the &mmu; of an
&rtos; <emphasis>must</emphasis> lock the pages of real-time tasks in
the physical RAM, in order to avoid the paging overhead. &posix;
provides the <function>mlock()</function><indexterm>
<primary><function>mlock()</function></primary></indexterm>
and <function>mlockall()</function><indexterm>
<primary><function>mlockall()</function></primary></indexterm>
function calls to do this locking. 
 </para>
 <para>
Page locking is a <emphasis>Quality of Service</emphasis><indexterm>
<primary>&qos;</primary></indexterm>
<indexterm><primary>Quality of Service</primary></indexterm>
feature of the operating system: it guarantees that tasks have a
specified amount of the memory resource at their disposal. In this
respect, it is similar to the &qos; extensions of the scheduler
(see <xref linkend="rtos-time-constraints">).
 </para>
 </listitem>
 <listitem>
 <para>
<emphasis>Dynamic allocation.</emphasis><indexterm>
<primary>dynamic allocation</primary></indexterm>
A task's memory needs can change during its lifetime, such that it
should be able to ask the operating system for more memory. The
&linux; kernel function for this purpose is
<function>vmalloc()</function>.<indexterm>
<primary><function>vmalloc()</function></primary></indexterm>
 (In kernel space!) A real-time memory manager
can only make this dynamic allocation of memory deterministic, if the
memory pages can be got from a <emphasis>pool</emphasis><indexterm>
<primary>pool</primary><secondary>memory pages</secondary></indexterm>
of free pages locked in the physical RAM. Anyway, dynamic allocation
should be used very carefully in any real-time task, because there is
no guarantee that the memory pool has enough free pages left to
satisfy all requests. This implies, for example, that &ipc; approaches
with dynamic memory allocation needs (such as unlimited mailboxes and
messages, see <xref linkend="sect-mess">) are to be avoided.
 </para>
 <para>
Nothing prevents an operating system from allocating memory in smaller
blocks than one page. However, finer and variable-sized granularity
implies more complex memory management, memory
<emphasis>fragmentation</emphasis>,<indexterm>
<primary>fragmentation</primary></indexterm>
<indexterm>
<primary>memory</primary><secondary>fragmentation</secondary>
</indexterm>
and hence less determinism.
 </para>
 </listitem>
 <listitem>
 <para>
<emphasis>Memory mapping.</emphasis><indexterm>
<primary>memory mapping</primary></indexterm>
<indexterm>
<primary>mapping</primary><secondary>memory</secondary>
</indexterm>
Real-time and embedded systems typically have to access peripheral
devices. The on-board registers in which these devices place their
data have to be <emphasis>mapped</emphasis> somewhere into the address
space of the corresponding device driver task. The &posix; system call
to do this mapping is <function>mmap()</function>. Typically, this
mapping is a configuration activity, and hence need not be done in
real-time.
 </para>
 </listitem>
 <listitem>
 <para>
<emphasis>Memory sharing.</emphasis><indexterm>
<primary>memory sharing</primary></indexterm>
<indexterm>
<primary>sharing</primary><secondary>memory</secondary>
</indexterm>
One of the most efficient ways for tasks to communicate is through
shared memory (see <xref linkend="sec-shared-mem-linux">). The
operating system has two major responsibilities in this area: (i)
(de)allocation of the shared memory, and (ii) synchronizing access to
that memory by different tasks. The latter topic is discussed in 
<xref linkend="ipc-synch">; the former is illustrated later in this
Chapter.
 </para>
 </listitem>
 <listitem>
 <para>
<emphasis>RAM disks.</emphasis><indexterm>
<primary>RAM disk</primary></indexterm>
In order to avoid the non-deterministic overhead of accessing hard
disks (for real-time systems) or the extra cost, extra space, and
reduced robustness of mechanical disk devices (for embedded systems),
part of the available RAM can be used to <emphasis>emulate</emphasis> a hard
disk.  This means that that memory is organized and accessed as a
<emphasis>file system</emphasis>, as if it resided on a hard disk.
</para>
<para>
When the RAM disk should be able to preserve data when the power is
switched off, the embedded system designer implements it in the form
of a <emphasis>flash disk</emphasis>. This is memory that can be
&ldquo;burned&rdquo; many thousand times, rather quickly, with very
little power, and from within the system code itself. Reburning
(&ldquo;flashing&rdquo;) is required either for reprogramming the
device, or for temporary storage of &ldquo;non-volatile&rdquo; data.
</para>
<para>
Having the system code in a file system on flash gives the added bonus
that the code need not be loaded into RAM, but may be executed in
place. This results in shorter start-up times.
 </para>
 </listitem>
 <listitem>
 <para>
<emphasis>Stripped libraries.</emphasis> RAM is a scarce resource in
real-time and embedded systems, such that programmers try to use as
little of it as possible. Hence, they often use &ldquo;stripped
down&rdquo; versions of general utility libraries (&ccc; library, math
library, GUI library, etc.).
<application>&mu;libc</application> is such a low-footprint version of
the &ccc; library.
 </para>
 </listitem>
</itemizedlist>
</para>

</sect1>

<sect1 id="sec-shared-mem-linux">
<title>Shared memory in Linux</title>
<para>
(TODO: update state of affairs on shared memory! &posix; &api; for
shared memory; sharing between real-time and user space; shared memory
management through locks and/or monitor;
<function>copy_to_user()</function>,
<function>copy_from_user()</function>.)
</para>

<para>
This Section discusses two complementary ways to allocate shared
memory in &linux;. There is nothing particularly real-time about
<emphasis>using</emphasis> shared memory;
<emphasis>allocating</emphasis> shared memory, however, is more
controversial: &rtlinux; doesn't allow memory to be allocated on-line,
&rtai; does.
</para>

<para>
The shared-memory pool is a block of physical memory set aside at boot
time so that &linux; does not use it for processes. To set up the pool,
you first determine how much physical memory the system has and how
much is to be used for shared memory. 

Normal &linux; processes are required to map physical memory into their
private address space to access it. To do this, the &linux; process
calls <function>open()</function> on the memory device
<filename>/dev/mem</filename>.<indexterm>
<primary>/dev/mem</primary></indexterm>
After the file descriptor is opened,
the &linux; process maps the shared memory into its address space
using <function>mmap()</function>, which returns a pointer to the
shared memory as mapped into the &linux; process's address space. Once
the shared memory is mapped, it may be accessed by dereferencing the
pointer. When the process terminates, you use
<function>munmap()</function> to unmap the shared memory by passing
the pointer and the size of its object.  Shared-memory access is
easier in the kernel space of the real-time &linux; variants, since
the real-time code executes in kernel space and thus is not required
to map physical addresses to virtual addresses.
</para>

<sect2 id="sec-shared-mem-linuxI">
<title>Allocation at boot time</title>
<para>
In this approach, a block of shared memory can be reserved at
<emphasis>boot time</emphasis>, to prevent &linux; from using it for
general purposes. The reservation is done using the
<parameter>append=</parameter> parameter in &lilo; (or something
similar for other bootloaders). Here is an example
of a <filename>/etc/lilo.conf</filename> file that reserves 1 Megabyte
for shared memory out of 16 available Megabytes:
<programlisting>
<![CDATA[
image=/boot/zImage
label=rtlinux
root=/dev/hda1
append="mem=15m"
]]>
</programlisting>
&linux; will use only the <emphasis>first</emphasis> 15 Megabytes, and the
last Megabyte can be used for shared memory purposes. The 
<emphasis>base address</emphasis> of the shared memory in the
above-mentioned example is:
<programlisting>
#define BASE_ADDRESS (15 * 0x100000)
</programlisting>
The real-time and user &linux; tasks use different ways to access the
memory.
</para>
<para>
A real-time task runs in kernel space, and hence can directly access the
memory with its <emphasis>physical address</emphasis>. For example, 
a data structure of type <parameter>my_data</parameter> at the start of
the shared memory is accessed as:
<programlisting>
<![CDATA[
my_data *ptr;

ptr = (my_data *) BASE_ADDRESS;
ptr->... = ...
]]>
</programlisting>
</para>
<para>
A user space task
must use its <emphasis>virtual address</emphasis>. This mapping of physical
memory into the virtual address space consists of two steps:
<itemizedlist>
 <listitem>
 <para>
The user space task must &ldquo;open&rdquo; the memory, by using the
<function>open()</function> system call on the device
<filename>/dev/mem</filename>:
<programlisting>
<![CDATA[
#include <unistd.h>     // POSIX defined open()
#include <fcntl.h>      // O_RDWR for read and write access
                        // or O_RDONLY for read-only access, etc.

int fd;                 // file descriptor for the opened memory

if ((fd = open("/dev/mem", O_RDWR)) < 0) {
       // handle possible error here
}
]]>
</programlisting>
 </para>
 </listitem>
 <listitem>
 <para>
The <function>mmap()</function> system call then does the actual mapping.
<programlisting>
<![CDATA[
my_data *ptr;

ptr = (my_data *) mmap (0, sizeof(my_data),
                        PROT_READ | PROT_WRITE,
                        MAP_SHARED,
                        fd, BASE_ADDRESS);
]]>
</programlisting>
The parameters <parameter>PROT_READ</parameter> and
<parameter>PROT_WRITE</parameter> are &posix;-defined and indicate
read and write access; <parameter>MAP_SHARED</parameter> indicates
that memory can be shared with any other task that maps it too. (See
the man pages for more details.)
 </para>
 </listitem>
</itemizedlist>
The shared memory is then accessed via the pointer, as in the kernel space
task.
<programlisting>
<![CDATA[
my_data *ptr;

ptr->... = ...
]]>
</programlisting>
</para>
<para>
The task must use <function>munmap()</function> to un-map the shared
memory used for the <parameter>my_data</parameter> data structure:
<programlisting>
<![CDATA[
my_data *ptr;

munmap(ptr, sizeof(my_data));
]]>
</programlisting>
</para>

</sect2>


<sect2 id="sec-shared-mem-linuxII">
<title>Allocation in kernel space</title>
<para>
The mbuff<indexterm>
<primary>mbuff</primary></indexterm>
module implements the <filename>/dev/mbuff</filename><indexterm>
<primary>/dev/mbuff</primary></indexterm>
 device. This device offers shared memory (allocated in the kernel
using <function>vmalloc()</function>) in kernel as well as in user
space. The shared memory does not need to be reserved at the system
startup and its size is not limited by memory fragmentation. It is
logically (but not physically) contiguous, and is locked in the
physical RAM.  When you allocate a block, the kernel first grabs the
free pages; then, if
there are not enough of them, it starts freeing more, by reducing buffers
and disk cache, and finally by swapping some user data and code out to disk.
This is certainly not a real-time operation&mdash;it may take
seconds to get something like 100 MB out of a machine with 128 MB of RAM.
</para>

</sect2>


<sect2 id="sec-shared-mem-linuxIII">
<title>Allocation in module</title>
<para>
(TODO: latest kernel options for memory allocation; dmaBuffer module.
Use this approach preferably at boot time, otherwise you might not be
able to find all the requested memory as a contiguous area in RAM.)
</para>
<para>
(TODO: <function>copy_from_user()</function>.)
</para>

</sect2>


</sect1>


</chapter>


<chapter id="devicedriver">
<title>Real-time device drivers</title>

<para>
<indexterm><primary>device driver</primary></indexterm>
An operating system must interface its peripheral devices to its
kernel software as well as to the user application software.  This
should be done in a modular and systematic way, such that all hardware
&ldquo;looks the same&rdquo; to software applications. The software
components that take care of this hardware-independent interfacing are
<emphasis>device drivers</emphasis>. For the &linux; real-time
variants, &comedi; (<xref linkend="sec-dev-comedi">)
is a successful and steadily growing project for
real-time and non real-time device drivers for data acquisition
cards.
</para>

<para>
In the &unix; world, device drivers are
visible through the <filename>/dev/xyz</filename> &ldquo;files&rdquo;
(where <filename>xyz</filename> stands for a
particular device, such as, for example, <filename>hda</filename> for
the first hard disk, <filename>ttyS0</filename> for the first serial
line, etc.). The 2.4.X kernels have introduced the
<filename>devfs</filename><indexterm>
<primary>devfs</primary></indexterm>
 and
<application>driverfs</application><indexterm>
<primary>driverfs</primary></indexterm>
(&ldquo;driver file system&rdquo;)
approaches, which give more structure to the information about the
devices that have actually been loaded. But all these things are for
<emphasis>user space</emphasis>, and hence not relevant for the
real-time &linux; variants that operate in kernel space.
</para>
<para>
The bookkeeping aspects of registering a device driver, with major and
minor numbers, as well as guidelines for writing device drivers,
are explained in detail in the &unix; literature. For the &linux;
example, Rubini's <emphasis>Linux Device Drivers</emphasis> book
(<citation>Rubini2001</citation>) is the major reference.
</para>


<sect1 id="sect-mech-policy">
<title>Mechanism and policy</title>
<para>
A major feature of a good device driver is that it
&ldquo;provides mechanism, not policy.&rdquo;<indexterm>
<primary>mechanism</primary></indexterm>
<indexterm><primary>policy</primary></indexterm>
This means that it should faithfully mimic all the interfacing
capabilities of the device (the &ldquo;mechanism&rdquo;), but nothing
more. It should <emphasis>not</emphasis> try to interpret the
exchanged data in any possible user context (the
&ldquo;policy&rdquo;), because that is the job of that user
application program itself. Indeed, once a device driver offers a
software interface to the mechanism of the device, an application
writer can use this mechanism interface to use the device in
<emphasis>one particular way</emphasis>. That is, some of the data
structures offered by the mechanism are interpreted in specific
physical units, or some of them are taken together because this
composition is relevant for the application. For example, an analog
output card can be used to generate voltages that are the inputs for
the electronic drivers of the motors of a robot; these voltages can be
interpreted as setpoints for the desired velocity of these motors, and
six of them are taken together to steer one particular robot with
six degrees of freedom. Some of the other outputs of the same physical
device can be used by another application program, for example to
generate a sine wave that drives a vibration shaker.
Or, the robot control program can use a force sensor that is
interfaced through a serial line. The force sensor device driver
&ldquo;talks&rdquo; to both the application program (i.e., the force
control algorithm), and the serial line device driver (for which it is
a &ldquo;user application&rdquo; itself!). It is obvious that the
serial line driver should never implement function calls that are only
useful in the force sensor driver context. Nevertheless, that's
exactly what happens in many projects with constrained scope, vision
and time&hellip;
</para>

<para>
As for the other operating system responsibilities discussed in the
previous Chapters, writing device drivers for an &rtos; or an &eos; is
not so much different from writing them for a general-purpose OS.
Basically, in an &rtos; context, one should make sure that all timing
delays in the drivers are both <emphasis>short</emphasis> and
<emphasis>deterministic</emphasis>, and every &dsr; should be an
appropriately prioritized thread or handler<indexterm>
<primary>handler</primary></indexterm>
that waits on an event to become active.
</para>

</sect1>


<sect1 id="devdriver-unix">
<title>Device drivers in UNIX</title>
<para>
In the &unix; philosophy, all devices are considered as being
&ldquo;files&rdquo;, and hence, their device drivers share the
following functions:
<function>open()</function>,
<function>close()</function>,
<function>read()</function>,
<function>write()</function>,
<function>read_config()</function>,
<function>set_config()</function>. The function call names are
operating system independent, and just
for demonstration. However, <function>open()</function>,
<function>close()</function>,
<function>read()</function> and
<function>write()</function>, are &posix; compliant. The
configuration function calls are, in &unix;, often taken together in
the <function>ioctl()</function><indexterm>
<primary><function>ioctl()</function></primary></indexterm>
function.
</para>

<para>
<function>open()</function> makes the device accessible for programs,
while <function>close()</function> ends the accessibility. The device
can be opened in different modes, such as, for example,
<parameter>O_RDONLY</parameter> (&ldquo;read-only&rdquo;),
<parameter>O_WRONLY</parameter> (&ldquo;write-only&rdquo;),
<parameter>O_RDWR</parameter> (&ldquo;read and write&rdquo;), and
<parameter>O_NONBLOCK</parameter> (&ldquo;non-blocking&rdquo;).
</para>

<para>
<function>read()</function> and <function>write()</function>
interchange data between the peripheral device and the (kernel or
application) software: a known data structure is copied from one place
in memory to another. Of course, the exact contents of that data
structure depend on the device and/or on the particular use of this
device by the programmer. 
</para>

<para>
<function>read_config()</function> reads out the device's current
configuration status, and <function>set_config()</function> programs
that configuration.  Configuration management often has less strict
timing constraints than reading and writing. It also has less
standardized function calls, because of the larger variety in possible
settings of different hardware. Nevertheless, the &posix;
standard prescribes the use of the <function>ioctl()</function>
function call, for all configuration actions that don't fit cleanly in
the classic &unix; stream I/O model of 
<function>open()</function>, <function>close()</function>,
<function>read()</function>, and <function>write()</function>:
<programlisting>
<![CDATA[
int ioctl(int d, int request, ...)
]]>
</programlisting>
The <parameter>d</parameter> parameter is the &ldquo;file
descriptor&rdquo; with which the device has been opened;
<parameter>request</parameter> is the particular configuration
identifier; and <parameter>...</parameter> are possible arguments that
come with the <parameter>request</parameter>.
</para>

</sect1>


<sect1 id="devdriver-isr-dsr">
<title>Complex device drivers</title>
<para>
A simple device driver needs nothing more than writing and/or reading of
some hardware registers on a peripheral device.
Some devices interact with the software through hardware interrupts.
Hence, their device drivers must include an &isr;, and possibly also a
&dsr;<indexterm>
<primary>&isr;</primary></indexterm>
<indexterm><primary>&dsr;</primary></indexterm>
(see <xref linkend="sect-idsr">).
Recall that only a subset of all kernel space functions
are available in the run-time context of an &isr;. And a real-time
device driver is subjected to even more constraints.
</para>
<para>
Almost all devices can be interfaced in
<emphasis>Programmed Input/Output<indexterm>
<primary>programmed IO</primary></indexterm>
 (PIO)<indexterm><primary>PIO</primary></indexterm></emphasis> mode:
the processor is responsible for accessing bus addresses allocated to
the device, and to read or write data. Some devices also allow shared
memory, or even <emphasis>Direct Memory Access<indexterm>
<primary>Direct Memory Access</primary></indexterm>
 (DMA)<indexterm><primary>DMA</primary></indexterm></emphasis>:
the device and the memory exchange data amongst each other directly,
without needing the processor. DMA is a feature of the bus, not of the
operating system; the operating system, however, must support its
processes to use the feature, i.e., provide a system call to
initialize DMA transfer, and a handler to react to the notification of
the device that it has finished its DMA. Anyway, support for shared
memory and DMA makes a device driver again a bit more complex.
</para>

<para>
From the point of view of system developers, it is worthwhile, in the
case of complex devices or systems with lots of devices, to
standardize the structure and the &api; for the device drivers as much
as possible:
<itemizedlist>

<listitem>
<para>
<emphasis>&api;</emphasis>: devices that offer similar
mechanisms should have the same software interface, and their
differences should be coped with by parameterizing the interfaces, not
by changing the interface for each new device in the family.
</para>
</listitem>
<listitem>
<para>
<emphasis>Structure</emphasis>: many electronic interfaces have more
than one layer of functionality between the hardware and the operating
system, and the device driver code should reflect this fact. For
example, many different interface cards use the same &pci; driver chips,
or use the parallel port to connect to the hardware device. Hence,
providing &ldquo;low-level&rdquo; device drivers for these &pci; chips
and parallel ports allows for an increased modularity and
reusability of the software. And the mechanism of the low-level
drivers is used with different policies in the various higher-level
drivers.
</para>
</listitem>
</itemizedlist>
</para>

</sect1>


<sect1 id="sec-dev-comedi">
<title>Comedi</title>

<para>
David Schleef<indexterm>
<primary>David Schleef</primary></indexterm><indexterm>
<primary>Schleef</primary><secondary>David</secondary></indexterm>
started the 
<ulink url="http://stm.lbl.gov/comedi/">Comedi</ulink> 
project to interface lots of different cards for measurement and
control purposes. Cards of this type are often called
<emphasis>Data Acquisition</emphasis><indexterm>
<primary>data acquisition</primary></indexterm>
cards, or DAQ cards.<indexterm><primary>DAQ</primary></indexterm>
Schleef designed a structure which is a balance between
<emphasis>modularity</emphasis> (i.e., it's fairly easy to integrate a
new card because most of the infrastructure part of the driver can be
reused) and <emphasis>complexity</emphasis> (i.e., the structure
doesn't present so much overhead that new contributors are scared away
from writing their new drivers within the &comedi; framework).
</para>
<para>
&comedi; works with a standard &linux; kernel, but also with its
real-time extensions <link linkend="sect-rtai">&rtai;</link> and
<link linkend="sect-rtlinux">&rtlinux;</link>.
</para>
<para>
The &comedi; project consists of two packages, and three parts:
the &ldquo;<application>comedi</application>&rdquo; package contains
the drivers, and the <application>kcomedilib</application> kernel
module for &linux; (which is a library to use the drivers in
real-time); the
&ldquo;<application>comedilib</application>&rdquo; package implements
the user space access to the device driver functionality.
</para>

<para>
The cards supported in &comedi; have one or more of the following
features: <emphasis>analog input</emphasis> channels,
<emphasis>analog output</emphasis> channels,
<emphasis>digital input</emphasis> channels, and
<emphasis>digital output</emphasis> channels. The digital channels are
conceptually quite simple, and don't need much configuration: the
number of channels, their addresses on the bus, and their direction
(input/output).
</para>
<para>
The analog channels are a bit more complicated. Typically, an analog
channel can be programmed to generate or read a voltage between a
lower and an upper threshold (e.g., -10V and +10V); the card's
electronics can be programmed to automatically sample a set of
channels, in a prescribed order; to buffer sequences of data on the
board; or to use DMA to dump the data in an available part of memory,
without intervention from the processor.
</para>
<para>
Many interface cards have extra functionality, besides the analog and
digital channels. For example, an EEPROM<indexterm>
<primary>EEPROM</primary></indexterm>
 for configuration and board
parameters, calibration inputs, counters and timers, encoders (=
quadrature counter on two channels), etc. Therefore, &comedi; offers
more than just analog and digital data acquisition.
</para>
<para>
The kernel space structures that &comedi; uses have the following
hierarchy:
<itemizedlist>
 <listitem>
<para>
<emphasis>channel<indexterm><primary>channel</primary></indexterm>
</emphasis>: the lowest-level component, that represents the
properties of one single data channel (analog in or out; digital in or
out). Each channel has parameters for: the voltage range, the
reference voltage, the channel polarity (unipolar,<indexterm>
<primary>unipolar</primary></indexterm> bipolar<indexterm>
<primary>bipolar</primary></indexterm>), a conversion factor between
voltages and physical units.
</para>
 </listitem>
 <listitem>
<para>
<emphasis>sub-device</emphasis>:<indexterm>
<primary>sub-device</primary></indexterm>
a set of functionally identical channels that are physically
implemented on the same (chip on an) interface card. For example, a
set of 16 identical analog outputs.  Each sub-device has parameters
for: the number of channels, and the type of the channels.
</para>
 </listitem>
 <listitem>
<para>
<emphasis>device</emphasis>:<indexterm>
<primary>device</primary></indexterm>
a set of sub-devices that are physically implemented on the same
interface card; in other words, the interface card itself.  For
example, the <application>National Instruments 6024E</application>
device has a sub-device with 16 analog input channels, another
sub-device with two analog output channels, and a third sub-device
with eight digital inputs/outputs.  Each device has parameters for:
the device identification tag from the manufacturer, the
identification tag given by the operating system (in order to
discriminate between multiple interface cards of the same type), the
number of sub-devices, etc.
</para>
 </listitem>
</itemizedlist>
</para>
<para>
The basic functionalities offered by &comedi; are:
<itemizedlist>
 <listitem>
<para>
<emphasis>instruction</emphasis>:<indexterm>
<primary>instruction</primary></indexterm>
to synchronously perform one single data acquisition on a specified
channel, or to perform a configuration on the channel.
&ldquo;Synchronous&rdquo;<indexterm><primary>synchronous</primary>
</indexterm> means that the calling process blocks until the data
acquisition has finished.
</para>
 </listitem>
 <listitem>
<para>
<emphasis>scan</emphasis>:<indexterm>
<primary>scan</primary></indexterm>
repeated instructions on a number of different channels, with a
programmed sequence and timing.
</para>
 </listitem>
 <listitem>
<para>
<emphasis>command</emphasis>:<indexterm>
<primary>command</primary></indexterm>
start or stop an autonomous (and hence asynchronous<indexterm>
<primary>asynchronous</primary></indexterm>) data acquisition
(i.e., a number of scans) on a specified set of channels.
&ldquo;Autonomous&rdquo; means: without interaction from the software,
i.e., by means of on-board timers or possibly external triggers.
</para>
 </listitem>
</itemizedlist>
This command functionality is not offered by all DAQ cards.  When
using &rtai; or &rtlinux;, the command functionality is emulated
through the <function>comedi_rt_timer</function> virtual driver.
The command functionality is very configurable, with respect to the
choice of events with which to signal the progress of the programmed
scans: external triggers, end of instruction, etc.
</para>
<para>
&comedi; not only offers the API to access the functionality of the
cards, but also to query the capabilities of the installed &comedi;
devices. That is, a user process can find out on-line what channels
are available, and what their physical parameters are (range,
direction of input/output, etc.).
</para>
<para>
&comedi; contains more than just procedural function calls: it also
offers event-driven functionality. The data acquisition can signal
its completion by means of an interrupt or a
<emphasis>callback</emphasis><indexterm>
<primary>callback</primary></indexterm> function call.  Callbacks are
also used to signal errors during the data acquisition or when writing
to buffers, or at the end of a scan or acquisition that has been
launched previously to take place asynchronously (i.e., the card fills
up some shared memory buffer autonomously, and only warns the user
program after it has finished).
</para>
<para>
The mechanisms for synchronization and interrupt handling are a bit
different when used in a real-time context (i.e., with either &rtai; or
&rtlinux;), but both are encapsulated behind the same &comedi;
calls.
</para>
<para>
Because multiple devices can all be active at the same time, &comedi;
provides (non-SMP!) locking primitives to ensure atomic operations on
critical sections of the code or data structures.
</para>
<para>
Finally, &comedi; offers the above-mentioned &ldquo;high-level&rdquo;
interaction, i.e., at the level of user space device drivers, through
file operations on entries in the <filename>/dev</filename> directory
(for access to the device's functionality), or interactively from the
command line through the &ldquo;files&rdquo; in the
<filename>/proc</filename> directory (which allow inspection of the
status of a &comedi; device). This high-level interface resides in the
&ldquo;comedilib&rdquo; tarball, which is the user space library, with
facilities to connect to the kernel space drivers residing in the
&ldquo;comedi&rdquo; tarball.
</para>

<sect2 id="sec-comedi-write">
<title>Writing a Comedi device driver</title>
<para>
(TODO: describe series of steps.)
</para>
</sect2>

</sect1>

<sect1 id="sect-rtcom">
<title>Real-time serial line</title>
<para>
A real-time device driver for the serial lines<indexterm>
<primary>serial line</primary></indexterm> is integrated into
&rtai;.
There used to be an independent project,
<ulink url="http://rt-com.sourceforge.net/">rt_com<indexterm>
<primary>rt_com</primary></indexterm></ulink>, but the developers
joined the &rtai; bandwagon, and the code was thoroughly rewritten.
(Under supervision of the &comedi; maintainer, David Schleef.)
</para>
<para>
The &rtai; device driver resides in the &ldquo;SPdrv<indexterm>
<primary>SPdrv</primary></indexterm>&rdquo; (Serial Port<indexterm>
<primary>serial port</primary></indexterm> driver) sub-directory of
&rtai;. It provides very configurable address initialization,
interrupt handling, buffering, callbacks, and non-intrusive buffer
inspection. It's a nice purely &ldquo;mechanism&rdquo; device driver.
</para>

</sect1>

<sect1 id="sect-rtpar">
<title>Real-time parallel port</title>
<para>
A real-time device driver for the parallel port<indexterm>
<primary>parallel port</primary></indexterm> is integrated into
&comedi;. It's not much different from a user space driver, except
for the real-time interrupt handler that can be connected to the
interrupt that can be generated on pin 10 of the parallel port.
The driver does <emphasis>not</emphasis> support 
<emphasis>ECP/EPP</emphasis> parallel ports.
</para>

</sect1>


<sect1 id="sect-rtnet">
<title>Real-time networking</title>
<para>
The <emphasis>rtnet<indexterm>
<primary>rtnet</primary></indexterm></emphasis> project used to be
stand-alone, but is now also integrated into &rtai;. It provides a
<emphasis>common programming interface</emphasis> (real-time and
user space) between the &rtos; and the device drivers of ethernet
cards. Of course, TCP is not supported, due to its inherently
non-realtime specifications; UDP is supported.
</para>
<para>
Although about every possible ethernet card has a &linux; driver,
these cannot be used unchanged for hard real-time, because their
interrupt handling is not real-time safe. Only a couple of the most
popular cards are supported, and there is not much interest from the
community to port more drivers.
</para>

<para>
The <emphasis>CAN bus<indexterm>
<primary>CAN bus</primary></indexterm></emphasis>
is a two-wire bus with a 1Mbits/sec maximum transmission rate, that
has become very popular in many industries, such as the automotive industry. It
can be used for real-time, thanks to its CSMA/CD-NDBA bus arbitration
protocol. CSMA/CD-NDBA stands for <emphasis>Carrier Sense Multiple
Access with Collision Detect&mdash;Non-Destructive Bit
Arbitration.</emphasis> CSMA is also used for ethernet: all
clients of the bus sense what is happening on the bus, and stop
transmitting when they sense a collision of messages from different
clients. The CAN bus adds, in hardware, the NDBA part: this 
guarantees that the bit sent on the bus is not 
destroyed in a collision. In CAN the <emphasis>dominant bit</emphasis>
is the logical &ldquo;0&rdquo;, and it overrides the
<emphasis>recessive bit</emphasis> (logical &ldquo;1&rdquo;).
So the client that sends a dominant bit will see this dominant bit on
the bus, and can continue sending. Each client on the CAN bus has a
unique and statically defined identifier of 11 bits wide (29 bits in
the extended version of the standard), that corresponds to its
<emphasis>priority</emphasis>.  That means that the client with the
most dominant bits early on in its identifier will be the one that
survives the NDBA the longest, and hence it is the one that gets the
bus first.  So, the CAN bus implements
&ldquo;priority-based scheduling&rdquo; of its clients.
Due to the hardware limitations that must guarantee the
above-mentioned procedure of surviving dominant bits, a CAN bus has a
maximum length of about 100 meters.
</para>

<para>
The
<ulink url="http://www.cs.columbia.edu/~hgs/rtp/">Real-time Transport
Protocol (RTP)</ulink>
(<ulink url="ftp://ftp.isi.edu/in-notes/rfc1889.txt">RFC 1889</ulink>)
and the
<emphasis>Real-Time Publish-Subscribe</emphasis> protocol (drafted by Real-Time
Innovations, and to be adopted by the
<ulink url="http://www.ida-group.org">IDA</ulink>)
are &ldquo;policies&rdquo; on top of the <emphasis>normal</emphasis>
ethernet protocol. Hence, despite their names, they will at most be
soft real time.
</para>

</sect1>


</chapter>


<chapter id="chap-rtai">
<title>RTAI: the features</title>

<para>
This Chapter introduces the &rtai; real-time operating system, as an
illustration of the concepts and terminology introduced in the
previous Chapters. It describes which features are available in
&rtai;, and what the &api; looks like.  This Chapter doesn't aim to be
a reference or user manual of all &rtai; commands; you should look for
those manuals on the &rtai; 
<ulink
 url="http://www.aero.polimi.it/~rtai/documentation/index.html">webpage.
</ulink>
</para>


<sect1 id="rtai-overview">
<title>Overview</title>
<para>
&rtai; consists of five complementary parts:
<orderedlist>

<listitem>
<para>
The <emphasis>HAL (Hardware Abstraction Layer)</emphasis>
provides an interface to the hardware, on top of which both
&linux; and the hard real-time core can run.
</para>
</listitem>

<listitem>
<para>
The <emphasis>&linux; compatibility layer</emphasis>
provides an interface to the &linux; operating system, with which
&rtai; tasks can be integrated into the &linux; task management,
without &linux; noticing anything.
</para>
</listitem>

<listitem>
<para>
<emphasis>&rtos; core.</emphasis>
This part offers the hard real-time functionality for task scheduling, 
interrupt processing, and locking. This functionality is not really
different from other real-time operating systems.
</para>
</listitem>

<listitem>
<para>
<emphasis>LX/RT (Linux Real-Time).</emphasis>
The modularity offered by a Hardware Abstraction Layer separated from
a core built on top of it, is used in other operating systems too,
e.g., &ecos;. The particular thing about &rtai; is the &lxrt;
component, that makes soft and hard real-time features available to
user space tasks in &linux;. &lxrt; puts a strong emphasis on
offering a <emphasis>symmetric</emphasis> real-time &api;: the same
real-time functionality should be useable with the same function calls
from user space as well as from kernel space. And also the &ipc; that
&lxrt; offers between user space and kernel space real-time tasks
works with a symmetric &api;.
</para>
</listitem>

<listitem>
<para>
<emphasis>Extended functionality packages.</emphasis> The core is
extended with useful extras, such as: several forms of inter-process
communication, network and serial line drivers; &posix; interface;
interfaces to domain-specific third-party toolboxes such as Labview,
&comedi; (<xref linkend="sec-dev-comedi">) and Real-Time Workshop;
software watchdogs; etc.
</para>
</listitem>

</orderedlist>
This Chapter explains what features are available in each of these
major &rtai; parts, as of the 24.1.9 version of &rtai; (May 2002).
Details about the exact function prototypes can be found in the &rtai;
reference manual. The following Chapter discusses their
<emphasis>implementation</emphasis>.
The discussion is categorized according to the
contents of the previous Chapters of this document. In summary, the
feature set of &rtai; is quite complete, offering almost all
previously presented concepts. &rtai; also implements some
&posix;<indexterm><primary>&posix;</primary></indexterm> parts
(<xref linkend="os-standards">): it has &posix; 1003.1c
compliant pthreads, mutexes and condition variables, and &posix;
1003.1b compliant pqueues. But &posix; compliance is not high on the
priority list of new developments. (A property that &rtai; shares with
standard &linux; development, by the way.)
</para>

</sect1>


<sect1 id="rtai-tasksched">
<title>Task management and scheduling</title>
<para>
Summary: &rtai; offers the whole variety of real-time tasks
and schedulers.
Besides normal tasks that end up in the scheduler queues of the
operating system, &rtai; offers also non-schedulable units of
execution: <emphasis>tasklets</emphasis>,
<emphasis>timers</emphasis>, <emphasis>&asr;s</emphasis>, and
<emphasis>queue blocks</emphasis>.
</para>

<sect2 id="rtai-task">
<title>Task management</title>
<para>
&rtai; has its own specific &api;, but offers &posix; wrappers for
threads. A task is created with the following function:
<programlisting>
<![CDATA[
int rt_task_init (
   RT_TASK *task,
   void (*rt_thread)(int),
   int data,
   int stack_size,
   int priority,
   int uses_fpu,
   void(*signal)(void)
);
]]>
</programlisting>
The function's arguments are:
<itemizedlist>

<listitem>
<para>
<parameter>task</parameter> is a pointer to an
<parameter>RT_TASK</parameter> type structure whose space must be
provided by the application. It must be kept during the whole lifetime
of the real time task.
</para>
</listitem>

<listitem>
<para>
<parameter>rt_thread</parameter> is the entry point of the task
function. The parent task can pass a single integer value
<parameter>data</parameter> to the new task.
</para>
</listitem>

<listitem>
<para>
<parameter>stack_size</parameter> is the size of the stack to be used by the new
task.
</para>   
</listitem>

<listitem>
<para>
<parameter>priority</parameter> is the priority to be given to the task.
The highest <parameter>priority</parameter> is 0, while the lowest is
<parameter>RT_LOWEST_PRIORITY</parameter>.
</para>
</listitem>

<listitem>
<para>
<parameter>uses_fpu</parameter> is a flag. Nonzero value indicates
that the task will save the floating point registers at context
switches.  On a <emphasis>(multi) uni-processor</emphasis>, a
real-time task does <emphasis>not</emphasis> save its
floating point context by default.
However, when the task is created for a <emphasis>symmetric
multi-processing</emphasis> system, the floating point context
<emphasis>is</emphasis> saved, because the task's context must be saved
against &cpu; migration anyway.
</para>
</listitem>

<listitem>
<para>
<parameter>signal</parameter> is an &ldquo;&asr;&rdquo; function 
(<xref linkend="sect-idsr">) that is called, within the
task environment and with interrupts disabled, when the task becomes
the current running task after a context switch.
</para>
</listitem>

</itemizedlist>
Here is typical &rtai; code for creating and starting a real-time
task, from within an <function>init_module()</function>, that
periodically runs the function whose code is in the application
dependent function <function>fun()</function>:
<programlistingco>
<areaspec>
<area id="rttask"   coords=2>
<area id="taskinit" coords=6>
<area id="setrun"   coords=7>
<area id="setfpu"   coords=8>
<area id="gettime"  coords=9>
<area id="mkperio"  coords=10>
<area id="runfun"   coords=17>
<area id="waitper"  coords=21>
</areaspec>
<programlisting>
<![CDATA[
#define STACK_SIZE 4096
static RT_TASK mytask;

int init_module(void)
{
  rt_task_init(&mytask, fun, 0, STACK_SIZE, 0, 1, 0);
  rt_set_runnable_on_cpus(&mytask, ...);
  rt_linux_use_fpu(1);
  now = rt_get_time();
  rt_task_make_periodic( \
    &mytask, now + 2000, 100*1000*1000);
  return 0;
}

// function that runs periodically in 
// the created real-time task:
static void fun(int t) {
  ...
  while (...) {
    ... // do what has to be done each period
    rt_task_wait_period();
  }
}
]]>
</programlisting>
<calloutlist>
<callout arearefs="rttask">
<para>
The task's data structure.
</para>
</callout>
<callout arearefs="taskinit">
<para>
Initialize the task's data structures, giving it, among other things,
values for stack space and static priority.
</para>
</callout>
<callout arearefs="setrun">
<para>
Ask the OS to run the task on a <emphasis>selection</emphasis> of
specified processors.
</para>
</callout>
<callout arearefs="setfpu">
<para>
Ask the OS to save the floating point registers when switching
contexts.
</para>
</callout>
<callout arearefs="gettime">
<para>
Read in the current absolute time.
</para>
</callout>
<callout arearefs="mkperio">
<para>
Ask the OS to run this task periodically. This call also sets the
first time instant that the thread wants to become active, and its
period. These times are in <emphasis>nanoseconds</emphasis>.
</para>
</callout>
<callout arearefs="runfun">
<para>
The function to be run in the real-time task.
</para>
</callout>
<callout arearefs="waitper">
<para>
Go to sleep until the scheduler wakes you up when your timer expires.
</para>
</callout>
</calloutlist>
</programlistingco>
Using &rtai;'s wrappers for &posix; pthreads, a typical skeleton for
task (&ldquo;thread&rdquo;) creation and activation looks like this:
<programlistingco>
<areaspec>
<area id="pthreadcreate" coords=14>
<area id="pthreadsignal" coords=19>
<area id="pthreadsched"  coords=32>
<area id="pthreadtimed"  coords=38>
</areaspec>
<programlisting>
<![CDATA[
static pthread_t       thread; // POSIX data structures for task,
static pthread_mutex_t mutex;  // ... mutex,
static pthread_cond_t  cond;   // ... and condition variable,

int init_module(void)
{
  pthread_attr_t attr;         // POSIX data structure for
                               // task properties 

  pthread_attr_init  (&attr);        // initializations
  pthread_mutex_init (&mutex, NULL); // ...
  pthread_cond_init  (&cond, NULL);  // ...

  pthread_create (&thread, &attr, fun, 0);

  ... // doing stuff until it's time to delete thread

  pthread_mutex_lock (&mutex);
  pthread_cond_signal (&cond);
  pthread_mutex_unlock (&mutex);
  ...
}

// function that runs periodically in 
// the created real-time task:
static void fun(int t) {
  #define TIME_OUT 10000000

  ...
  struct sched_param p;
  p.sched_priority = 1;
  pthread_setschedparam (pthread_self(), SCHED_FIFO, &p);

  ...
  while (1) {
    time = ...;
    pthread_mutex_lock (&mutex);
    pthread_cond_timedwait (&cond, &mutex, time+TIME_OUT);
    pthread_mutex_unlock (&mutex);
    ... // do what has to be done each period
  }
}
]]>
</programlisting>
<calloutlist>
<callout arearefs="pthreadcreate">
<para>
Here, the task is created, i.e., its data structures and function to
execute are initialized.
</para>
</callout>
<callout arearefs="pthreadsignal">
<para>
The created task is signaled; the signal is typically the notification
that the thread should stop itself, in a clean way.
</para>
</callout>
<callout arearefs="pthreadsched">
<para>
The thread itself fills in its scheduling properties. (This could also
be done by another task.)
</para>
</callout>
<callout arearefs="pthreadtimed">
<para>
This command makes the task sleep until the specified next
wake-up time, or until it receives the signal to clean up. (This
signal could have another, task-dependent interpretation too, of
course.)
</para>
</callout>
</calloutlist>
</programlistingco>
The function <function>pthread_cond_timedwait()</function> is used
both to wait for a time to expire and to wait for a condition to be
signaled (<xref linkend="sect-condvar">):
<programlisting>
<![CDATA[
int pthread_cond_timedwait(
   pthread_cond_t *cond,     // condition variable
   pthread_mutex_t *mutex,   // mutex to protect scope
   struct timespec *abstime);// absolute time to wake up
]]>
</programlisting>
So, the semantics of the pure &rtai; and the &posix; implementations
are not exactly the same. Another difference between both versions is
that a &posix; thread initialization makes the task active immediately,
while the task created by a <function>rt_task_init()</function> is
suspended when created, and must be activated explicitly. (This is
achieved by the second argument in
<function>rt_task_make_periodic()</function>: it specifies the time
when the task will be first woken up.)
</para>

</sect2>


<sect2 id="rtai-tasklets">
<title>Tasklets</title>
<para>
The data structure to hold the status and the data connected to a
tasklet (<xref linkend="linux-tasks">) is created with the following
function:
<programlisting>
<![CDATA[
struct rt_tasklet_struct *rt_init_tasklet(void)
]]>
</programlisting>
It is configured with
<programlisting>
<![CDATA[
int rt_insert_tasklet(
   struct rt_tasklet_struct *tasklet,  // data structure
   int priority,                       // static priority
   void (*handler)(unsigned long),     // function to execute
   unsigned long data,                 // data to pass to handler
   unsigned long id,                   // user-defined identifier
   int pid)                            // OS process identifier
]]>
</programlisting>
There also exist function calls with which one can set most of the
above-mentioned properties separately. &rtai; executes the tasklets
before it runs its scheduler. And tasklets can set priorities to
influence the order in which the operating system executes them.  An
application can also execute a tasklet explicitly (or rather, wake
it up for execution) by a
<function>rt_tasklet_exec(tasklet)</function> function call. Tasklets
do <emphasis>not</emphasis> save their floating point registers by
default.
</para>

</sect2>


<sect2 id="rtai-timers">
<title>Timers</title>
<para>
<indexterm><primary>timer</primary></indexterm>
These are nothing else but timed tasklets, so their interface functions
have the same semantics as those of tasklets.
<function>rt_init_timer()</function> is in fact a copy of
the <function>rt_init_tasklet()</function> function.  The major
difference lies in the <function>rt_insert_timer()</function>
function, which inserts the timer tasklet in a list of timers to be
processed by a <emphasis>time manager<indexterm>
<primary>time manager</primary></indexterm></emphasis> task. This
function has two more parameters than
<function>rt_insert_tasklet()</function>, which give the tasklet the
semantics of a timer:
<programlisting>
<![CDATA[
int rt_insert_timer(
   struct rt_tasklet_struct *timer,
   int priority,
   RTIME firing_time,         // fire time
   RTIME period,              // period, if timer must be periodic
   void (*handler)(unsigned long),
   unsigned long data,
   int pid)
]]>
</programlisting>
The <parameter>pid</parameter>
parameter is not needed, because the timer tasklet will never be
referred to as a &ldquo;real&rdquo; task anyway, i.e., as a task that
is scheduled by the scheduler. So, some of the fields in the timer
data structure (which is equal to the tasklet data structure) are not
used.  The timer list is ordered according to the desired fire time of
the timer tasklets. The time manager always inherits the priority of
the highest-priority timer.
</para>

</sect2>


<sect2 id="rtai-asr">
<title>&asr;</title>
<para>
Via the <parameter>signal</parameter> parameter of
<function>rt_task_init()</function>, the application programmer can
register a function that will be executed whenever the task it belongs
to will be scheduled, and <emphasis>before</emphasis> that task is
scheduled.  This is the functionality of what is sometimes called an 
<emphasis>Asynchronous Service Routine<indexterm>
<primary>Asynchronous Service Routine</primary></indexterm>
<indexterm><primary>&asr;</primary></indexterm></emphasis>
in other operating systems (<xref linkend="sect-idsr">).
An &asr; is different from a tasklet, in the following sense: 
<itemizedlist>
<listitem>
<para>
The &asr;'s function is executed in the context of the task it belongs
to, while a tasklet has its own context.
</para>
</listitem>

<listitem>
<para>
The &asr; is run with interrupts disabled. (This is not always the
case for &asr;s in other operating systems.)
</para>
</listitem>

<listitem>
<para>
The &asr; is not a schedulable task itself, i.e., it will never show
up in the scheduling queues, just like the timer tasklets.
</para>
</listitem>

</itemizedlist>
</para>

</sect2>

<sect2 id="rtai-qblks">
<title>Queue blocks</title>
<para>
(TODO: what is the real use of queue blocks? Seems to be a primitive
that somebody happened to have implemented (on &qnx;) and ported to
&rtai; without filling a real need?)
</para>
<para>
Queue blocks are simple structures that contain a
pointer to a function and the time at which the function must be
executed. The queue blocks are linked into a list and a family of
functions is provided to manage the whole thing.  The functions are
of the type
<function>void (*handler)(void *data, int event)</function>, and therefore
the simple structures also include the arguments data and
event. The application may or may not use any of the arguments.
</para>

</sect2>


<sect2 id="rtai-sched">
<title>Task scheduling</title>
<para>
&rtai; provides several complementary scheduling configuration
options:
<itemizedlist>

<listitem>
<para>
Depending on the hardware, the following scheduling options
are available:
uni-processor scheduling (UP),<indexterm>
<primary>uni-processor scheduling</primary></indexterm>
<indexterm><primary>UP scheduling</primary></indexterm>
<indexterm>
 <primary>scheduling</primary><secondary>uni-processor</secondary>
</indexterm>
multi uni-processor scheduling (MUP;<indexterm>
<primary>multi-processor scheduling</primary></indexterm>
<indexterm><primary>MUP scheduling</primary></indexterm>
<indexterm>
 <primary>scheduling</primary><secondary>multi-processor</secondary>
</indexterm>
the application programmer can assign each task to a specific (set of)
processors), and symmetric multi-processor scheduling (SMP;<indexterm>
<primary>symmetric multi-processor scheduling</primary></indexterm>
<indexterm><primary>SMP scheduling</primary></indexterm>
<indexterm>
 <primary>scheduling</primary>
 <secondary>symmetric multi-processor</secondary>
</indexterm>
the scheduler assigns tasks at run-time to any available processor).
</para>
</listitem>

<listitem>
<para>
Tasks can configure <emphasis>periodic scheduling<indexterm>
<primary>periodic scheduling</primary></indexterm>
<indexterm>
 <primary>scheduling</primary><secondary>periodic</secondary>
</indexterm></emphasis>
(scheduled every time a certain time has elapsed) and
<emphasis>one-shot scheduling<indexterm>
<primary>one-shot scheduling</primary></indexterm>
<indexterm>
 <primary>scheduling</primary><secondary>one-shot</secondary>
</indexterm></emphasis>
(scheduled only once at the requested time).
</para>
</listitem>

<listitem>
<para>
&rtai; has static priority-based scheduling
(&ldquo;<parameter>SCHED_FIFO</parameter>&rdquo;) as its default hard
real-time scheduler, but it also offers Round Robin time-sliced
scheduling (&ldquo;<parameter>SCHED_RR</parameter>&rdquo;),
<emphasis>Rate Monotonic Scheduling</emphasis>, and
<emphasis>Earliest Deadline First</emphasis>.
It's the responsibility of the <emphasis>application
programmer</emphasis> to get the scheduler and timings choices
correct.  When multiple scheduler schemes are used, &rtai; has made
the (arbitrary) choice to give EDF tasks a
higher priority than tasks scheduled with other policies.
</para>
</listitem>

</itemizedlist>
By definition (<xref linkend="prior-sched">), only
<parameter>SCHED_FIFO</parameter> and <parameter>SCHED_RR</parameter>
can be chosen on a per task basis, and with a per task quantum time
(only relevant for <parameter>SCHED_RR</parameter>):
<programlisting>
<![CDATA[
 rt_set_sched_policy(
   RT_TASK *task,     // pointer to task's data structure
   int policy,        // 0: RT_SCHED_FIFO, 1: RT_SCHED_RR
   int rr_quantum_ns  // RR time slice in nanoseconds, lying between
                      // 0 (= default Linux value) and 
                      // 0x0FFFFFFF (= 1/4th of a second)
);
]]>
</programlisting>
(Needing Round Robin scheduling in an application program should
be considered as an indication that the program logic is poorly
designed&hellip;)
The EDF and RMS schedulers need <emphasis>global</emphasis>
information about the task timings, so the procedures are a little bit
more complex:
<itemizedlist>

<listitem>
<para>
<emphasis>RMS</emphasis>:<indexterm>
<primary>Rate Monotonic</primary></indexterm>
<indexterm>
 <primary>scheduling</primary>
 <secondary>Rate Monotonic</secondary>
</indexterm>
the RMS scheduler is (re)initialized by the function
<function>void rt_spv_RMS(int cpuid)</function>, to be called
<emphasis>after</emphasis> the operating system knows the timing
information of <emphasis>all</emphasis> your tasks. That is, after you
have made all of your tasks periodic at the beginning of your
application, or after you create a periodic task dynamically, or after
changing the period of a task.  The <parameter>cpuid</parameter>
parameter of the function <function>rt_spv_RMS()</function> is only
used by the multi uni-processor scheduler.
</para>
</listitem>

<listitem>
<para>
<emphasis>EDF</emphasis>:<indexterm>
<primary>Earliest Deadline First</primary></indexterm>
<indexterm>
 <primary>scheduling</primary>
 <secondary>Earliest Deadline First</secondary>
</indexterm>
this scheduler must know the
<emphasis>start</emphasis> and <emphasis>termination</emphasis> times
of all your tasks, so a task must call
the function
<programlisting>
<![CDATA[
  void rt_task_set_resume_end_times(RTIME resume_time, RTIME end_time);
]]>
</programlisting>
at the end of <emphasis>every</emphasis> run of one cycle of the task.
</para>
</listitem>

</itemizedlist>
&rtai; provides several function calls that influence task scheduling
(<filename>ABCscheduler/rtai_sched.c</filename>):
<programlisting>
<![CDATA[
void rt_task_yield(void);
   // stops the current task and puts it at the end of the list of
   // ready tasks, with the same priority. The scheduler makes the
   // next ready task of the same priority active.  

int rt_task_suspend(RT_TASK *task);
   // suspends execution of the "task". It will not be executed
   // until a call to "rt_task_resume()" or
   // "rt_task_make_periodic()" is made.

int rt_task_resume(RT_TASK *task);
   // resumes execution of the "task" previously suspended by
   // "rt_task_suspend()" or makes a newly created task ready to run.

int rt_task_make_periodic(
  RT_TASK *task,
  RTIME start_time,
  RTIME period);
   // mark the "task" as available for periodic execution, with 
   // period "period", when "rt_task_wait_period()" is called.
   // The time of the task's first execution is given by
   // "start_time", an absolute value measured in clock ticks.

int rt_task_make_periodic_relative_ns(
  RT_TASK *task,
  RTIME start_delay,
  RTIME period);
   // As "rt_task_make_periodic", but with "start_delay" relative 
   // to the current time and measured in nanosecs.

void rt_task_wait_period(void);
   // suspends the execution of the currently running task until 
   // the next period is reached. The task must have been previously
   // marked for execution with "rt_task_make_periodic()" or
   // "rt_task_make_periodic_relative_ns()".
   // The task is suspended only temporarily, i.e. it simply gives up
   // control until the next time period.

void rt_task_set_resume_end_times(RTIME resume, RTIME end);
int rt_set_resume_time(RT_TASK *task, RTIME new_resume_time);
int rt_set_period(RT_TASK *task, RTIME new_period);

RTIME next_period(void);
   // returns the time when the caller task will run next.

void rt_busy_sleep(int ns);
   // delays the execution of the caller task without giving back
   // the control to the scheduler. This function burns away CPU 
   // cycles in a busy wait loop. It may be used for very short
   // synchronization delays only. "ns" is the number of
   // nanoseconds to wait.

void rt_sleep(RTIME delay);
   // suspends execution of the caller task for a time of
   // "delay" internal count units. During this time the CPU is 
   // used by other tasks.
   // A higher priority task or interrupt handler can run
   // during the sleep, so the actual time spent in this function
   // may be longer than the specified time.
          
void rt_sleep_until(RTIME time);
   // similar to "rt_sleep", but the parameter "time" is the
   // absolute time until when the task is suspended. If the
   // given time is already passed this call has no effect.

int rt_task_wakeup_sleeping(RT_TASK *task);
]]>
</programlisting>
The status of a task can be found with:
<programlisting>
<![CDATA[
int rt_get_task_state (RT_TASK *task);
]]>
</programlisting>
The task state is formed by the bitwise OR of one or more of the
following flags:
<itemizedlist>

<listitem>
<para>
<parameter>READY</parameter>: task is ready to run (i.e. unblocked).
</para>
</listitem>

<listitem>
<para>
<parameter>SUSPENDED</parameter>: task is suspended.
</para>
</listitem>

<listitem>
<para>
<parameter>DELAYED</parameter>: task waits for its next running period
or expiration of a timeout.
</para>
</listitem>

<listitem>
<para>
<parameter>SEMAPHORE</parameter>: task is blocked on a semaphore.
</para>
</listitem>

<listitem>
<para>
<parameter>SEND</parameter>: task sent a message and waits for the
receiver task.
</para>
</listitem>

<listitem>
<para>
<parameter>RECEIVE</parameter>: task waits for an incoming message.
</para>
</listitem>

<listitem>
<para>
<parameter>RPC</parameter>: task sent a Remote Procedure Call and the
receiver has not got it yet.
</para>
</listitem>

<listitem>
<para>
<parameter>RETURN</parameter>: task waits for reply to a Remote
Procedure Call.
</para>
</listitem>

</itemizedlist>
The returned task state is only approximate information. Timer
and other hardware interrupts may cause a change in the state of the
queried task before the caller could evaluate the returned value. The
caller should disable interrupts if it wants reliable info about
another task.
</para>
<para>
A task can find its own task data structure with:
<programlisting>
<![CDATA[
RT_TASK *rt_whoami (void); 
]]>
</programlisting>
</para>
<para>
Tasks can choose whether or not to save floating point registers at
context switches:
<programlisting>
<![CDATA[
int rt_task_use_fpu (RT_TASK* task, int use_fpu_flag);
   // informs the scheduler that floating point arithmetic 
   // operations will be used by the "task".
   // If "use_fpu_flag" has a nonzero value, FPU context is also
   // switched when task or the kernel become active. This makes task
   // switching slower. The initial value of this flag is set by
   // "rt_task_init()" when the real time task is created. By default,
   // a Linux "task" has this flag cleared. It can be set with the
   // "LinuxFpu" command line parameter of the "rtai_sched" module.

void rt_linux_use_fpu (int use_fpu_flag);
   // informs the scheduler that floating point arithmetic 
   // operations will be used in the background task (i.e.,
   // the Linux kernel itself and all of its processes).
]]>
</programlisting>
</para>

</sect2>


<sect2 id="rtai-time">
<title>Getting the time</title>
<para>
&rtai; provides several function calls for getting the current time
(<filename>ABCscheduler/rtai_sched.c</filename>):
<programlisting>
<![CDATA[
RTIME rt_get_time(void)
RTIME rt_get_time_cpuid(unsigned int cpuid)
RTIME rt_get_time_ns(void)
RTIME rt_get_time_ns_cpuid(unsigned int cpuid)
RTIME rt_get_cpu_time_ns(void)
]]>
</programlisting>
The time is given in &ldquo;ticks&rdquo;, or in nanoseconds. The
parameter <parameter>cpuid</parameter> indicates the number of the
&cpu; in a multi-processor system, and these calls (and
<function>rt_get_cpu_time_ns</function>) read the local
<emphasis>Time Stamp Clock</emphasis>,<indexterm>
<primary>Time Stamp Clock</primary></indexterm>
<indexterm> <primary>TSC</primary></indexterm>
instead of the external timer chip. The latter has usually a lower
resolution.
</para>

</sect2>

</sect1>


<sect1 id="rtai-int">
<title>Interrupts and traps</title>
<para>
An interrupt handler (<xref linkend="sect-inter-sw">) must be
registered with the operating system via a call of the following
function:
<programlisting>
<![CDATA[
int rt_request_global_irq (
   unsigned int irq,
   void (*handler)(void)
);
]]>
</programlisting>
This call installs the function <parameter>handler</parameter> as the
interrupt service routine for IRQ level <parameter>irq</parameter>.
<parameter>handler</parameter> is then invoked whenever interrupt
number <parameter>irq</parameter> occurs. The installed handler must
take care of properly activating any &linux; handler using the same irq
number, by calling the
 <function>void rt_pend_linux_irq (unsigned int irq)</function>
function, which &ldquo;pends&rdquo; the interrupt to &linux; (in software!).
This means that &linux; will process the interrupt as soon as it gets
control back from &rtai;. Note that, at that time, hardware interrupts
are again <emphasis>enabled</emphasis> for &rtai;. 
The use of <function>rt_pend_linux_irq()</function> only makes
sense for <emphasis>edge-triggered<indexterm>
<primary>edge-triggered interrupt</primary></indexterm></emphasis>
<indexterm>
 <primary>interrupt</primary><secondary>edge-triggered</secondary>
</indexterm>
interrupts (<xref linkend="sect-inter-hw">):
the level-triggered<indexterm> 
<primary>level-triggered interrupt</primary></indexterm>
<indexterm>
 <primary>interrupt</primary><secondary>level-triggered</secondary>
</indexterm> one is still active, unless you have
acknowledged it already
<emphasis>explicitly</emphasis>.
</para>
<para>
From an &rtai; task, one can also register an interrupt handler with
&linux;, via
<programlisting>
<![CDATA[
int rt_request_linux_irq (
   unsigned int irq,
   void (*handler)(int irq, void *dev_id, struct pt_regs *regs),
   char *linux_handler_id,
   void *dev_id
);
]]>
</programlisting>
This forces &linux; to share the interrupt. The handler is appended
to any already existing &linux; handler for the same irq and run
as a &linux; irq handler. The handler appears
in <filename>/proc/interrupts</filename>, under the name given in the
parameter <parameter>linux_handler_id</parameter>.
The parameter
<parameter>dev_id</parameter> is passed to the interrupt handler, in
the same way as the standard &linux; irq request call.
<programlisting>
<![CDATA[
void rt_request_timer (
   void (*handler)(void),
   int tick,
   int apic
);
]]>
</programlisting>
registers the <parameter>handler</parameter> as the &isr; of a timer
interrupt. If <parameter>tick</parameter> is zero, the timer is
executed only once. If <parameter>apic</parameter> is nonzero, the
local APIC is used (<xref linkend="sect-inter-hw">).
The difference with the timer tasklets (<xref linkend="rtai-timers">)
is that the latter are not directly registered as an interrupt
handler, but executed by a timer manager (which is itself woken up by
a timer).
</para>

<para>
Floating point register saving is <emphasis>on by default</emphasis>
in &rtai; interrupt handlers.
The &dsr; functionality (<xref linkend="sect-idsr">) is available
through tasklets, and &asr; functionality through the
<function>signal()</function> parameter.  One can also select which
&cpu; must receive and handle a particular IRQ, via the
<function>rt_assign_irq_to_cpu(int irq, int cpu)</function> function.
<function>rt_reset_irq_to_sym_mode(int irq)</function> resets this
choice, back to the symmetric &ldquo;don't care&rdquo; behaviour.
</para>
<para>
In &rtai;, application programmers must explicitly enable interrupts
themselves, via <function>rt_irq_enable()</function>. Whether this is
done in the &isr; or in the &dsr; depends on the hardware of the
application: if it has an interrupt ready immediately, enabling the
interrupts in the &isr; could cause recursive calls to the &isr;,
possibly blocking the system.
</para>

<para>
<xref linkend="sect-inter-sw"> discussed the concept of
<emphasis>traps<indexterm>
<primary>trap</primary></indexterm>
</emphasis> and <emphasis>trap handlers<indexterm>
<primary>trap handler</primary></indexterm>
<indexterm>
 <primary>handler</primary><secondary>trap</secondary>
</indexterm></emphasis>. The &api; that &rtai; offers is as follows:
<programlisting>
<![CDATA[
   // data structure of handler
typedef int (*RT_TRAP_HANDLER)(
  int,                            // interrupt vec
  int,                            // signal number
  struct pt_regs *,               // argument pointers that can be
                                  // given to a trap handler (see Linux)
  void *                          // data pointer
);

   // fill in trap handler data structure:
int rt_trap_handler(
  int vec,
  int signo,
  struct pt_regs *regs,
  void *dummy_data
);

   // register trap handler:
RT_TRAP_HANDLER rt_set_task_trap_handler(
  RT_TASK *task,          // task which registers handler
  unsigned int vec,       // interrupt vec which triggers handler
  RT_TRAP_HANDLER handler // data structure of handler
);
]]>
</programlisting>
&rtai; reserves 32 <emphasis>system signals<indexterm>
<primary>signal</primary></indexterm></emphasis>,
most of which correspond to what standard &linux; uses.  These signals
are denoted by &ldquo;<parameter>signo</parameter>&rdquo; in the code
above, and are defined in the data structure
<parameter>rtai_signr[NR_TRAPS]</parameter> in the file
<filename>arch/i386/rtai.c</filename>, for i386 only.
The default configuration policies of &rtai; are: (i) to add the same
handler to all traps, (ii) to trap the non-maskable interrupt of the
processor and let it do nothing (getting it in the first place
indicates that something major has gone wrong), and (iii) to suspend a
task that calls a non-existing handler.
</para>

</sect1>


<sect1 id="rtai-ipc-sync">
<title>&ipc;: synchronization</title>
<para>
Also in this area, &rtai; offers the whole range of synchronization
primitives: semaphore and mutex, condition variable, and barrier or
flags (&ldquo;bits&rdquo;).
</para>


<sect2 id="rtai-ipc-sem">
<title>Semaphore and mutex</title>
<para>
&rtai; has counting semaphores,<indexterm>
<primary>counting semaphore</primary></indexterm>
<indexterm>
 <primary>semaphore</primary>
 <secondary>counting</secondary>
</indexterm>
binary semaphores<indexterm>
<primary>binary semaphore</primary></indexterm>
<indexterm>
 <primary>semaphore</primary><secondary>binary</secondary>
</indexterm>
 and
recursive semaphores<indexterm>
<primary>recursive semaphore</primary></indexterm>
<indexterm>
 <primary>semaphore</primary><secondary>recursive</secondary>
</indexterm>,
<xref linkend="sect-semaphore">.
Semaphores can block tasks
waiting on them in FIFO or priority order.
</para>
<para>
Semaphores in &rtai; have the following &api;:
<programlisting>
<![CDATA[
 void rt_sem_init      (SEM* sem, int value);
 int rt_sem_signal     (SEM* sem);
 int rt_sem_wait       (SEM* sem);
   // version that returns immediately when not free:
 int rt_sem_wait_if    (SEM* sem);
   // versions with a timeout:
 int rt_sem_wait_until (SEM* sem, RTIME time);  // absolute time
 int rt_sem_wait_timed (SEM* sem, RTIME delay); // relative time
]]>
</programlisting>
&rtai; semaphores have
<emphasis>priority inheritance<indexterm>
<primary>priority inheritance</primary></indexterm></emphasis>
and (adaptive)
<emphasis>priority ceiling<indexterm>
<primary>priority ceiling</primary></indexterm></emphasis>
(<xref linkend="sect-prior-inherit">). 
</para>

</sect2>


<sect2 id="rtai-mutex">
<title>POSIX mutex</title>
<para>
&rtai; implements the standard
&posix; mutexes<indexterm><primary>mutex</primary></indexterm>
(<xref linkend="sect-mutex">), with the prescribed
<emphasis>priority inheritance<indexterm>
<primary>priority inheritance</primary></indexterm></emphasis>.
The &api; is, of course, the standard &posix; &api; as presented in
<xref linkend="sect-mutex">.
</para>

</sect2>


<sect2 id="rtai-ipc-spinlocks">
<title>Spinlocks</title>
<para>
Application programmers can choose from a wide variety of spinlocks,
each with well-defined scope. Basically, they look like the spinlocks
in &linux;, with a &ldquo;<function>rt_</function>&rdquo; prefix, but
using the same data structures. But the &rtai; spinlocks need an extra
level with respect to &linux;, because &linux; runs on a hardware
simulation layer as soon as &rtai; has been activated.  Indeed, from
that moment on, the &linux; calls are replaced by &ldquo;soft&rdquo;
versions, in the sense that &rtai; can always pre-empt critical
&linux; sections. Here is the list of &rtai; spinlocks:
<programlisting>
<![CDATA[
unsigned long flags;
spinlock_t lock;

rt_spin_lock(&lock);
 /* critical section in Linux (as the `spin_lock()' there, hence
    Linux's (soft) interrupts still pass), but pre-emptable by RTAI.
 */
rt_spin_unlock(&lock);

rt_spin_lock_irq(&lock);
 /* same as above but Linux's soft interrupts disabled. */
rt_spin_unlock_irq(&lock);

flags = rt_spin_lock_irqsave(&lock);
 /* critical section in RTAI with hardware interrupts disabled
    on current CPU. */
rt_spin_unlock_irqrestore(flags,&lock);
]]>
</programlisting>
The following locks don't need a lock data structure, because they
are drastic, and use a &ldquo;global lock&rdquo; over all processors:
<programlisting>
<![CDATA[
rt_global_cli();
 /* critical section with interrupts disabled on the calling CPU,
    and "global lock" for all CPUs. */
rt_global_sti();

flags = rt_global_save_flags_and_cli();
 /* as "rt_global_cli()", but saves the state of the interrupt flag,
    and the "global lock" flag. */
rt_global_restore_flags(flags);

flags = hard_lock_all();
 /* Most drastic way of making the system safe from pre-emption by
    interrupts.
    On UP boxes is the same as "rt_global_save_flags_and_cli()"
    above. On SMP locks out all the other CPUs, sending them an
    IPI (inter-processor interrupt) signal. */
hard_unlock_all(flags);
]]>
</programlisting>
The normal &linux; spinlocks still work in &rtai;, so be careful when
using them, because they won't always offer the same protection in
&rtai; hard real-time as what you expect from knowing how they behave
in un-modified &linux;.
</para>

</sect2>


<sect2 id="rtai-ipc-condvar">
<title>Condition variable</title>
<para>
&rtai; implements the standard &posix; condition variables
(<xref linkend="sect-condvar">).
</para>

</sect2>


<sect2 id="rtai-ipc-barrier">
<title>Barrier/flags</title>
<para>
&rtai; has a barrier-like (<xref linkend="sect-barrier">) primitive,
which it calls <emphasis>bits</emphasis>. It allows tasks to suspend
on an AND or OR combination of bits set in a 32-bit mask called
&ldquo;BITS&rdquo; (<filename>include/rtai_bits.h</filename>):
<programlisting>
<![CDATA[
struct rt_bits_struct {
   struct rt_queue queue;  // must be first in struct
   int magic;
   int type;  // needed because BITS and semaphores share some things
   unsigned long mask;
};

typedef struct rt_bits_struct BITS;
]]>
</programlisting>
Tasks can read and write bits in this mask, and perform
&ldquo;wait&rdquo; calls on the mask. The full &api; is as follows:
<programlisting>
<![CDATA[
#include <rtai_bits.h>

   // basic bit operation functions, indicated by macros:
#define SET_BITS              0
#define CLR_BITS              1
#define SET_CLR_BITS          2
#define NOP_BITS              3

void rt_bits_init(BITS *bits, unsigned long mask)
   // create and initialize the bits structure pointed to by "bits",
   // setting bits mask to "mask".

int rt_bits_delete(BITS *bits)
   // delete the "bits" data structure

unsigned long rt_get_bits(BITS *bits)
   // get the actual value of the "bits" mask.

unsigned long rt_bits_signal(
  BITS *bits,
  int setfun,
  unsigned long masks)
   // execute "setfun" (which is any of the basic bits operations
   //  above: SET_BITS, etc.), oring/anding masks onto the actual 
   // bits mask, schedule any task blocked on "bits" if the new bits
   // mask meets its request;
   // returns the value of bits after executing setfun;
   // in case of combined operations (AND and OR), "masks" is to be 
   // cast to a pointer of a two elements array of unsigned longs 
   // containing the masks to be used for the combined "setfun".

int rt_bits_reset(BITS *bits, unsigned long mask)
   // unconditionally schedule any task blocked on "bits" and 
   // reset its mask to "mask";
   // returns the value of bits mask before being reset to "mask".

int rt_bits_wait(
  BITS *bits,
  int testfun,
  unsigned long testmasks,
  int exitfun,
  unsigned long exitmasks,
  unsigned long *resulting_mask)
   // test "bits" mask against "testmasks" according to "testfun"
   // (which is any of the test functions above, e.g., SET_BIT, etc.);
   // if the test is not satisfied block the task;
   // whenever the condition is met, execute "exitfun" (any bits
   // operation above), using "exitmasks";
   // save the mask resulting after the whole processing in the
   // variable pointed to by "resulting_mask".

int rt_bits_wait_if(
  BITS *bits,
  int testfun,
  unsigned long testmasks,
  int exitfun,
  unsigned long exitmasks,
  unsigned long *resulting_mask)
   // as "rt_bits_wait",
   // but does not block if "testfun" is not satisfied.

int rt_bits_wait_until(
  BITS *bits,
  int testfun,
  unsigned long
  testmasks,
  int exitfun,
  unsigned long exitmasks,
  RTIME time,
  unsigned long *resulting_mask)
   // as "rt_bits_wait",
   //  but waits at most till "time" expires.

unsigned long rt_bits_wait_timed(
  BITS *bits,
  int testfun,
  unsigned long testmasks,
  int exitfun,
  unsigned long exitmasks,
  RTIME delay,
  unsigned long *resulting_mask)
   // as "rt_bits_wait_until",
   // but waits at most for "delay" to meet the required condition.
]]>
</programlisting>
</para>

</sect2>

</sect1>


<sect1 id="rtai-ipc-dataexch">
<title>&ipc;: data exchange</title>
<para>
&rtai; has messages, mailboxes, and &posix; message queues
(&ldquo;pqueues&rdquo;), including synchronous message passing
semantics (<xref linkend="sect-mess">), &fifo;s,
Remote Procedure Calls, and shared memory. 
</para>


<sect2 id="rtai-messages">
<title>Messages</title>
<para>
<indexterm><primary>message</primary></indexterm>
<indexterm>
 <primary>&ipc;</primary><secondary>message</secondary>
</indexterm>
&rtai; makes the distinction between messages and mailboxes, as
explained in <xref linkend="sect-mess">. The messages are the more
primitive form, and in &rtai;, the basic implementation of messages
carries only a <emphasis>four byte</emphasis> message in the call
itself. So, no buffering must be provided.
The &api; for this simple inter-task messaging is:
<programlisting>
<![CDATA[
RT_TASK* rt_send (RT_TASK* task, unsigned int msg);
  // sends the message "msg" to the task "task". 
  // If the receiver task is ready to get the message,
  // "rt_send" returns immediately.
  // Otherwise the caller task is blocked.

RT_TASK* rt_send_if (RT_TASK* task, unsigned int msg);
  // sends the message ``if possible''. If the receiver task is not
  // ready, the sending task just continues.
  // On success, "task" (the pointer to the task that received the
  // message) is returned.
  // If message has not been sent, 0 is returned.

RT_TASK* rt_send_until (RT_TASK* task, unsigned int msg, RTIME time);
RT_TASK* rt_send_timed (RT_TASK* task, unsigned int msg, RTIME delay);
  // As "rt_send", but the sending is given up after either an
  // absolute "time", or a relative "delay".

RT_TASK* rt_receive (RT_TASK* task, unsigned int *msg);   
  // gets a message from the "task", and stores it in the buffer "msg"
  // that the caller task provides.
  // If "task" is equal to 0, the caller accepts messages from any
  // task. If there is a pending message, "rt_receive" returns
  // immediately. Otherwise the caller task is blocked and queued up.

RT_TASK* rt_receive_if (RT_TASK* task, unsigned int *msg);
  // as "rt_receive", but only ``if possible''.

RT_TASK* rt_receive_until (RT_TASK* task, unsigned int *msg, RTIME time);
RT_TASK* rt_receive_timed (RT_TASK* task, unsigned int *msg, RTIME delay);
  // as "rt_receive", but with time limits as in the send calls.
]]>
</programlisting>
Blocking may happen in priority order or on a FIFO basis. This is determined by
an &rtai; compile time option <parameter>MSG_PRIORD</parameter>.
</para>
<para>
More recently, &rtai; got so-called
<emphasis>extended messages</emphasis>. These are less efficient than
their four-byte cousins, but more flexible in that they allow messages
of arbitrary size. To this end, the extended message functions use a
double buffer data structure:
<programlisting>
<![CDATA[
struct mcb_t {
  void *sbuf;  // buffer for the sender
  int sbytes;  // number of bytes sent
  void *rbuf;  // buffer for the receiver
  int rbytes;  // number of bytes received
};
]]>
</programlisting>
The following function prototypes are quite self-explanatory, with
<parameter>smsg</parameter> indicating the sender's message buffer,
<parameter>ssize</parameter> the sender's message size, and
<parameter>rmsg</parameter> and <parameter>rsize</parameter> similarly
for the receiver.
<programlisting>
<![CDATA[
RT_TASK *rt_sendx(RT_TASK *task, void *smsg, int ssize)

RT_TASK *rt_sendx_if(RT_TASK *task, void *smsg, int ssize)

RT_TASK *rt_sendx_until(
  RT_TASK *task,
  void *smsg,
  int ssize,
  RTIME time)

RT_TASK *rt_sendx_timed(
  RT_TASK *task,
  void *smsg,
  int ssize,
  RTIME delay)

RT_TASK *rt_receivex(
  RT_TASK *task,
  void *msg,
  int size,
  int *truesize)

RT_TASK *rt_receivex_if(
  RT_TASK *task,
  void *msg,
  int size,
  int *truesize)

RT_TASK *rt_receivex_until(
  RT_TASK *task,
  void *msg,
  int size,
  int *truesize,
  RTIME time)

RT_TASK *rt_receivex_timed(
  RT_TASK *task,
  void *msg,
  int size,
  int *truesize,
  RTIME delay)

RT_TASK *rt_rpcx(
  RT_TASK *task,
  void *smsg,
  void *rmsg,
  int ssize,
  int rsize)

RT_TASK *rt_rpcx_if(
  RT_TASK *task,
  void *smsg,
  void *rmsg,
  int ssize,
  int rsize)

RT_TASK *rt_rpcx_until(
  RT_TASK *task,
  void *smsg,
  void *rmsg,
  int ssize,
  int rsize,
  RTIME time)

RT_TASK *rt_rpcx_timed(
  RT_TASK *task,
  void *smsg,
  void *rmsg,
  int ssize,
  int rsize,
  RTIME delay)

RT_TASK *rt_returnx(RT_TASK *task, void *msg, int size)
   // ???

int rt_isrpcx(RT_TASK *task)
   // ???
]]>
</programlisting>
</para>

</sect2>


<sect2 id="rtai-mbx">
<title>Mailboxes</title>
<para>
<indexterm><primary>mailbox</primary></indexterm>
<indexterm>
 <primary>&ipc;</primary><secondary>mailbox</secondary>
</indexterm>
&rtai; supports mailboxes (<xref linkend="sect-mess">). They are 
flexible in the sense that they allow to send
any message size by using any mailbox buffer size. The original
implementation uses a FIFO (First In, First Out) policy; a recent
addition are &ldquo;typed&rdquo; mailboxes, that have a priority
message delivery option.
Sending and receiving messages can be done with several policies:
<itemizedlist>

<listitem>
<para>
<emphasis>Unconditionally</emphasis>: the task blocks until the
whole message has passed.
</para>
</listitem>

<listitem>
<para>
<emphasis>Best-effort</emphasis>: only pass the bytes that can be
passed immediately.
</para>
</listitem>

<listitem>
<para>
<emphasis>Conditional on availability</emphasis>:
only pass a message if the whole message can be passed immediately.
</para>
</listitem>

<listitem>
<para>
<emphasis>Timed</emphasis>: with absolute or relative time-outs.
</para>
</listitem>

</itemizedlist>
The &api; for mailboxes is given in
<filename>include/rtai_sched.h</filename> (of all places&hellip;):
<programlisting>
<![CDATA[
struct rt_mailbox {
   int magic;    // identifier for mailbox data structure
   SEM sndsem,   // semaphores to queue sending...
       rcvsem;   // ... and receiving tasks.
   RT_TASK *waiting_task, // pointer to waiting tasks
           *owndby;       // pointer to task that created mailbox
   char *bufadr;          // mailbox buffer
   int size,     // mailbox size
       fbyte,    // circular buffer first byte pointer
       lbyte,    // circular buffer last byte pointer
       avbs,     // bytes in buffer
       frbs;     // bytes free
   spinlock_t lock;       // lock to protect access to buffer
};

typedef struct rt_mailbox MBX;

int rt_typed_mbx_init(MBX *mbx, int size, int qtype);
   // Initialize a mailbox "mbx" with a buffer of "size" bytes,
   // queueing tasks according to the specified type: FIFO_Q, PRIO_Q and
   // RES_Q.

int rt_mbx_init(MBX *mbx, int size);
   // equivalent to rt_typed_mbx_init(mbx, size, PRIO_Q)

int rt_mbx_delete(MBX *mbx);
   // Delete the mailbox "mbx".

int rt_mbx_send(MBX *mbx, void *msg, int msg_size);
   // Send unconditionally, i.e. return when the whole message has
   // been received or an error occurred, to the mailbox "mbx", the
   // message pointed by "msg", whose size is "msg_size" bytes.
   // Returns the number of unsent bytes.

int rt_mbx_send_wp(MBX *mbx, void *msg, int msg_size);
   // As "rt_mbx_send", but only available bytes.
   // ``_wp'' stands for: ``what possible.''

int rt_mbx_send_if(MBX *mbx, void *msg, int msg_size);
   // Send to the mailbox "mbx" only if all "msg_size" bytes
   // of "msg" can be received immediately. 
   // Returns the number of unsent bytes, i.e. either 0 or "msg_size".
   // ``_if'' stands for: ``if available.''

int rt_mbx_send_until(MBX *mbx, void *msg, int msg_size, RTIME time);
   // As "rt_mbx_send", unless the absolute time dead-line "time" 
   // is reached.

int rt_mbx_send_timed(MBX *mbx, void *msg, int msg_size, RTIME delay);
   // As "rt_mbx_send", unless the time-out "delay" has expired.

   // Similar semantics for receiving message:
int rt_mbx_receive(MBX *mbx, void *msg, int msg_size);
int rt_mbx_receive_wp(MBX *mbx, void *msg, int msg_size);
int rt_mbx_receive_if(MBX *mbx, void *msg, int msg_size);
int rt_mbx_receive_until(MBX *mbx, void *msg, int msg_size, RTIME time);
int rt_mbx_receive_timed(MBX *mbx, void *msg, int msg_size, RTIME delay);

int rt_mbx_evdrp(MBX *mbx, void *msg, int msg_size);
   // This is the ``unsafe'' version, that doesn't protect against
   // overwriting the circular message buffer.
   // The name stands for ``eventual dropping'' of data. (???)
]]>
</programlisting>
<emphasis>Typed</emphasis> mailboxes offer a functionality that is a
<emphasis>superset</emphasis> of the mailboxes above, adding the
following features:
<itemizedlist>

<listitem>
<para>
<emphasis>Message broadcasting</emphasis>: a message is sent to
<emphasis>all</emphasis> tasks that are pending on the same mailbox.
</para>
</listitem>

<listitem>
<para>
<emphasis>Priority configuration</emphasis>: an
<emphasis>urgent</emphasis> or <emphasis>normal</emphasis> wakeup policy
can be set when creating the mailbox.
</para>
</listitem>

</itemizedlist>
These features are achieved by adding a 1-byte <emphasis>type
field</emphasis> to every message inserted in a typed mailbox. So,
when receiving it is possible to discriminate normal, urgent and
broadcast messages. The type field is silently removed by the
receiving functions, so from the user point of view it is not visible.
Users must consider type fields only when specifying the typed mailbox
sizes.
</para>
<para>
The &api; for typed mailboxes is given in
<filename>include/rtai_tbx.h</filename>:
<programlisting>
<![CDATA[
struct rt_typed_mailbox {
   int magic;
   int waiting_nr;     // number of tasks waiting for a broadcast
   SEM sndsmx,         // semaphores to queue sending...
       rcvsmx;         // ... and receiving tasks.
   SEM bcbsmx;         // binary semaphore needed to wakeup the
                       // sleeping tasks when the broadcasting of a
                       // message is terminated 
   RT_TASK *waiting_task;
   char *bufadr;       // mailbox buffer 
   char *bcbadr;       // broadcasting buffer 
   int size;           // mailbox size 
   int fbyte;          // circular buffer read pointer 
   int avbs;           // bytes occupied
   int frbs;           // bytes free 
   spinlock_t buflock; // lock to protect buffer access
};

typedef struct rt_typed_mailbox TBX;

   // The function prototypes are similar to normal mailboxes,
   // with "_mbx_" replaced by "_tbx_". For example:
int rt_tbx_init(TBX *tbx, int size, int type);
int rt_tbx_send(TBX *tbx, void *msg, int msg_size);
   // etc.
   // Some functions are new:
int rt_tbx_broadcast(TBX *tbx, void *msg, int msg_size);
int rt_tbx_broadcast_if(TBX *tbx, void *msg, int msg_size);
int rt_tbx_broadcast_until(TBX *tbx, void *msg, int msg_size, RTIME time);
int rt_tbx_broadcast_timed(TBX *tbx, void *msg, int msg_size, RTIME delay);

int rt_tbx_urgent(TBX *tbx, void *msg, int msg_size);
int rt_tbx_urgent_if(TBX *tbx, void *msg, int msg_size);
int rt_tbx_urgent_until(TBX *tbx, void *msg, int msg_size, RTIME time);
int rt_tbx_urgent_timed(TBX *tbx, void *msg, int msg_size, RTIME delay);
]]>
</programlisting>
The <emphasis>unconditional</emphasis> versions of mailbox
communication correspond to
<emphasis>synchronous message passing<indexterm>
<primary>synchronous message passing</primary></indexterm></emphasis>.
<indexterm>
 <primary>message</primary><secondary>synchronous passing</secondary>
</indexterm>
</para>

</sect2>


<sect2 id="rtai-mq">
<title>POSIX message queues</title>
<para>
<indexterm><primary>message queue</primary></indexterm>
<indexterm>
 <primary>&ipc;</primary><secondary>&posix; message queue</secondary>
</indexterm>
<indexterm>
 <primary>&posix;</primary><secondary>message queue</secondary>
</indexterm>
&rtai; supports standard &posix; message queues
(<xref linkend="sect-mess">).
</para>

</sect2>


<sect2 id="rtai-fifo">
<title>FIFO</title>
<para>
<indexterm><primary>&fifo;</primary></indexterm>
<indexterm>
 <primary>&ipc;</primary><secondary>&fifo;</secondary>
</indexterm>
&fifo;s are a basic &ipc; data exchange primitive, and well supported
under &rtai;. It offers an &api; for kernel space &fifo;s, and one for
user space &fifo;s:
<programlisting>
<![CDATA[
struct rt_fifo_info_struct{
   unsigned int fifo_number;
   unsigned int size;
   unsigned int opncnt;
   char name[RTF_NAMELEN+1];
};

struct rt_fifo_get_info_struct{
   unsigned int fifo;
   unsigned int n;
   struct rt_fifo_info_struct *ptr;
};

   // initialize FIFO data structure:
int rtf_init(void);

/* Attach a handler to an RT-FIFO.
 *
 * Allow function handler to be called when a user process reads or
 * writes to 
 * the FIFO. When the function is called, it is passed the fifo number
 * as the 
 * argument.
 */

extern int rtf_create_handler(unsigned int fifo,   /* RT-FIFO */
      int (*handler)(unsigned int fifo)   /* function to be called
*/);


]]>
</programlisting>
</para>
<para>
Here is the skeleton of a user space task and a hard real-time task,
that use a &fifo; to communicate; the other &ipc; primitives use similar
skeletons.
<programlistingco>
<areaspec>
<area id="fifo0"  coords="8">
<area id="fifo1"  coords="12">
<area id="write"  coords="17">
<area id="read"   coords="19">
<area id="fifor0" coords="34">
<area id="fifor1" coords="35">
</areaspec>
<programlisting>
<![CDATA[
// user space task:
int main(int argc,char *argv[])
{
  int rtf, cmd;
  int data[...];
  double ddata[...];
  ...
  if ((rtf = open("/dev/rtf0", O_RDONLY)) < 0) {
          fprintf(stderr, "Error opening /dev/rtf0\n");
          exit(1);
  }
  if ((cmd = open("/dev/rtf1", O_WRONLY)) < 0) {
          fprintf(stderr, "Error opening /dev/rtf1\n");
          exit(1);
  }
  while (...) {
    write(cmd, &data, ...);
    ...
    read(rtf, &ddata, ...);
    ...
  };
  ...
  return 0;
}

// module that creates hard real-time task:
#define RTF 0
#define CMD 1

static RT_TASK mytask;

int init_module(void)
{
        rtf_create(RTF, 4000);
        rtf_create(CMD, 100);
        rt_task_init(&mytask, fun, 0, STACK_SIZE, 0, 1, 0);
        rt_set_runnable_on_cpus(&mytask, ...);
        rt_assign_irq_to_cpu(TIMER_8254_IRQ, TIMER_TO_CPU);
        rt_linux_use_fpu(1);
        now = rt_get_time();
        rt_task_make_periodic(&mytask, now + 2000, ...);
        return 0;
}

// function run in real-time task:
static void fun(int t) {
  ...
  while (...) {
    cpu_used[hard_cpu_id()]++;
    rtf_put(RTF, ..., ...);
    rtf_get(CMD, ..., ...);
    rt_task_wait_period();
  }
}
]]>
</programlisting>
<calloutlist>
<callout arearefs="fifo0">
<para>
Opens first &fifo; as a user space device.
</para>
</callout>
<callout arearefs="fifo1">
<para>
Opens second &fifo; as a user space device.
</para>
</callout>
<callout arearefs="write">
<para>
Writes data in the &fifo;.
</para>
</callout>
<callout arearefs="read">
<para>
Reads data from the &fifo;.
</para>
</callout>
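<callout arearefs="fifor0">
<para>
Creates the first &fifo; (number 0, size 4000) in the real-time module.
</para>
</callout>
<callout arearefs="fifor1">
<para>
Creates the second &fifo; (number 1, size 100) in the real-time module.
</para>
</callout>
</calloutlist>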
</programlistingco>
One can add a handler to a &fifo;, via
<function>rtf_create_handler()</function>. One can also send a signal
to notify data availability, via <function>rtf_set_async_sig(int fd,
int signum)</function>. This handler and signal functionality is not
available for the other &ipc; primitives.
</para>

</sect2>


<sect2 id="rtai-rpc">
<title>RPC</title>
<para>
<indexterm><primary>&rpc;</primary></indexterm>
<indexterm>
 <primary>&ipc;</primary><secondary>&rpc;</secondary>
</indexterm>
&rtai; supports Remote Procedure Calls, <xref linkend="sect-mess">.
(Even over a network, in which case the user is responsible for
using appropriate hardware, of course.
This text skips the details of this latter functionality, because it falls
outside of the scope of hard real-time systems.) The on-system &rpc;
in &rtai; works as a &ldquo;send/receive&rdquo; message pair:
a task sends a four-byte message to another task, and then waits until
a reply is received. The caller task is always blocked and queued up.
Calling this a &ldquo;Remote Procedure Call&rdquo; is a bit ambitious:
the communicating tasks just send four bytes, and they have to agree
on a protocol that defines the <emphasis>meaning</emphasis> of these
four bytes, and whether or not the message triggers the execution of a
procedure call at the receiver's end.
The &api; for this form of &rpc; is:
<programlisting>
<![CDATA[
RT_TASK *rt_rpc(
  RT_TASK *task,
  unsigned int to_do,
  unsigned int *reply);
   // The receiver task may get the message with any "rt_receive_*"
   // function. It can send the answer with "rt_return()".
   // "reply" points to a buffer provided by the caller.

RT_TASK *rt_return(
  RT_TASK *task,
  unsigned int reply);

RT_TASK *rt_rpc_if(
  RT_TASK *task,
  unsigned int to_do,
  unsigned int *result);

RT_TASK *rt_rpc_until(
  RT_TASK *task,
  unsigned int to_do,
  unsigned int *result,
  RTIME time);

RT_TASK *rt_rpc_timed(
  RT_TASK *task,
  unsigned int to_do,
  unsigned int *result,
  RTIME delay);

int rt_isrpc(RT_TASK *task);
   // After receiving a message, by calling "rt_isrpc" a task 
   // can find out whether the sender task "task" is waiting for
   // a reply or not.
   // "rt_return" is intelligent enough to not send an answer to 
   // a task which is not waiting for it. Therefore using "rt_isrpc"
   // is not  necessary and discouraged.
]]>
</programlisting>
The meaning of the suffixes &ldquo;<parameter>_if</parameter>&rdquo;,
&ldquo;<parameter>_until</parameter>&rdquo;, and 
&ldquo;<parameter>_timed</parameter>&rdquo; is as in the &api;s of
messages and mailboxes.
</para>

</sect2>


</sect1>


<sect1 id="rtai-mm">
<title>Memory management</title>
<para>
Shared memory implementation in
<filename class=directory>shmem</filename>.
Again symmetric. Dynamic memory management;
</para>
<para>
(TODO: more details.)
</para>

</sect1>


<sect1 id="rtai-devdrv">
<title>Real-time device drivers</title>
<para>
spdrv, rtnet, plus strong integration with &comedi;.
</para>
<para>
(TODO: more details.)
</para>

</sect1>


<sect1 id="rtai-proc">
<title><filename class=directory>/proc</filename> interface</title>
<para>
The <filename class=directory>/proc</filename> interface
is an extension to the standard &linux;
<filename class=directory>/proc</filename> interface feature:
files under the subdirectory <filename
class="directory">/proc/rtai</filename> give status and debug
information of the currently active &rtai; modules. These files are
activated when the associated module is inserted into the kernel.
</para>
<para>
<filename class=directory>/proc</filename> interface code can be
found in most &rtai; source files. It's a non real-time feature
(hence, only to be used by normal user space tasks), but it requires
support from the real-time kernel; this support is implemented again
via <emphasis>traps</emphasis>.
</para>

</sect1>


<sect1 id="rtai-modules">
<title>RTAI loadable modules</title>
<para>
&rtai;'s functionality is made available by dynamically
<emphasis>loading modules</emphasis> into the running (and patched)
&linux; kernel. Every module extends the &api; of the kernel with some
new &ldquo;objects&rdquo; (i.e., function calls and data structures).
Not all modules are needed in all cases, but, vice versa, 
dependencies exist between modules, i.e., in order to use
functionality in one module, one often also needs to load other
modules first.
</para>
<para>
<function>rtai</function> core module <filename>rtai.c</filename>, and
made in <filename class=directory>rtaidir</filename>.
</para>
<para>
Scheduler module <filename>ABCscheduler/rtai_sched.c</filename>.
</para>
<para>
Tasklet module: allocates and initializes the data structures for the
tasklet and timer queues; starts the
<function>timers_manager</function> task, that is responsible for the
execution of the timers;
</para>
<para>
Scheduler module <filename>&hellip;</filename>.
</para>
<para>
Extra scheduler module <filename>&hellip;</filename>.
</para>
<para>
&rtai; utilities module <filename>&hellip;</filename>.
</para>
<para>
Typed mailboxes module <filename>&hellip;</filename>.
</para>
<para>
pthreads module <filename>&hellip;</filename>.
</para>
<para>
Memory manager module <filename>&hellip;</filename>.
</para>
<para>
&fifo;s module <filename>fifos/rtai_fifos.c</filename>.
</para>
<para>
&lxrt; module <filename>lxrt/lxrt.c</filename>.
</para>
<para>
Serial line module <filename>spdrv/rtai_spdrv.c</filename>.
</para>
<para>
&cpp; module <filename>&hellip;</filename>.
</para>
<para>
Network RPC module <filename>net_rpc/net_rpc.c</filename>.
</para>
<para>
Tracing module <filename>trace/rtai_trace.c</filename>.
</para>
<para>
Watchdog module <filename>watchdog/rtai_watchdog.c</filename>.
</para>
<para>
Bits module <filename>bits/rtai_bits.c</filename>.
</para>
<para>
(TODO: explain contents of the different &rtai; modules; dependencies:
what must be loaded in order to use the different functionalities
mentioned above?)
</para>

</sect1>


<sect1 id="rtai-specific">
<title>Specific features</title>
<para>
&rtai; has developed a number of features that common real-time
operating systems miss:
<itemizedlist>

<listitem>
<para>
&lxrt; is the component that allows user space tasks to execute soft
and hard real-time functions. Because this feature is quite
extensive, section <xref linkend="rtai-lxrt"> gives more details.
</para>
</listitem>

<listitem>
<para>
Dynamic memory allocation, also by real-time tasks.
(TODO: give details.)
</para>
</listitem>

<listitem>
<para>
Integration of the
<ulink
 url="http://www.opersys.com/LTT/index.html">Linux Trace Toolkit</ulink>,
which allows one to trace (i.e., log to a buffer) a large number of
activities from the kernel: interrupts, scheduling, creation of tasks,
etc.
(TODO: give details.)
</para>
</listitem>

<listitem>
<para>
&cpp; support, <xref linkend="cpp">. 
</para>
</listitem>

</itemizedlist>
</para>

</sect1>

</chapter>



<chapter id="rt-linux-variants">
<title>Linux-based real-time and embedded operating systems</title>
<para> 
This Chapter presents &ldquo;spin-offs&rdquo; of the standard &linux;
kernel that provide hard real-time performance, or that are targeted to
embedded use.
</para>

<sect1>
<title>Introduction</title>
<para>
There are two major developments at the &rtos; level: &rtlinux; and
&rtai;. &rtai; forked off an earlier version of &rtlinux;.  &rtlinux;
and  &rtai; do basically the same thing (and do it with industrial
strength quality, except maybe for documentation&hellip;), they make
their sources available, they have partial &posix; compliance, but
they don't use compatible &api;s.
In the
<emphasis>embedded</emphasis> (but non real-time) &linux; world,
projects have emerged, such as &uclinux; and &etlinux;.
But probably standard &linux; is the major workhorse here, thanks to
its great configurability.
</para>

</sect1>


<sect1 id="sect-rtlinux">
<title>&rtlinux;: Real-Time Linux</title>
<para>
<ulink url="http://www.rtlinux.com">&rtlinux;</ulink> is a patch
for the standard &linux; kernel (often called the &ldquo;vanilla&rdquo;
&linux; kernel), for single as well as for multi-processor
kernels. It offers all components of a hard real-time
system in a multi-threaded real-time kernel, in which
standard &linux; is the lowest-priority thread. One advantage (or
disadvantage, depending on your taste) of this approach is that
real-time space and &linux; space (both kernel space and user space)
are strictly separated: programmers have to specify explicitly which
of their tasks should run with real-time capabilities, and which
others should not.
This separation also relieves the real-time kernel from
&ldquo;bookkeeping&rdquo; tasks such as booting, device
initialization, module (un)loading, or dynamic memory
allocation. None of these have real-time constraints, hence they
naturally belong to &linux; and not &rtlinux;.
From a programming point of view, most, but not all, functionality,
habits and tools of &linux; remain available at no cost,
<emphasis>and</emphasis> the real-time application can run and be
debugged on the same computer on which it is developed, without the
need for cross-compilation tools. This makes &ldquo;migration&rdquo;
for &linux; users quite painless. 
</para>

<para>
The disadvantage of a distribution in the form of a kernel patch is that
this patch has (i) to be maintained (by the &rtlinux; developers) over
evolving kernel versions, and (ii) applied (by the users) each time
they upgrade their kernel.  Pre-patched versions of some kernel
versions are available from the &rtlinux; web page.
The &rtlinux; patch is minor: it provides a &ldquo;virtual
interrupt&rdquo; emulation to standard &linux;, and offers a
kernel space micro-kernel with real-time scheduled threads.
&rtlinux; intercepts all hardware interrupts, checks whether an
interrupt is destined for a real-time service routine (and launches
the corresponding &isr; if it is), or forwards them to &linux; in
the form of a virtual interrupt, which is held until no real-time
activity must run. In this scheme, &linux; is never able to disable
<emphasis>hardware</emphasis> interrupts.
</para>
<para>
&rtlinux; comes (after compilation) as a set of loadable modules
within &linux;: the core module with the above-mentioned interrupt
controller handler, a real-time scheduler (with static priorities), a
timer module, a &fifo; implementation, shared memory and most
real-time lock and event primitives. This modularity makes
customization easier, and increases the embeddability (because
unnecessary modules need not be loaded).
</para>


<sect2>
<title>Functionalities</title>
<para>
&rtlinux; offers basic &posix; compliance: it has implemented the
<emphasis>Minimal Realtime System Profile</emphasis> (&posix; 1003.13,
<acronym>PSE51</acronym>). This means that it has basic thread
management, &ipc; primitives, and 
<function>open</function>/<function>read</function>/
<function>write</function>/&hellip; function calls, but only for basic
<emphasis>device</emphasis> I/O rather than full <emphasis>file
system</emphasis> support.
&rtlinux; has support for mutexes, condition variables, semaphores,
signals, spinlocks, and FIFOs. It implements some form of
<emphasis>user space real time</emphasis>, based on the signal
mechanism. &rtlinux; tasks can communicate with &linux; tasks, with
the guarantee that this &ipc; is <emphasis>never</emphasis> blocking
at the &rtlinux; side.
</para>

<para>
Some function calls do not follow the &posix; standard; these are
named <function>pthread_&hellip;_np()</function>, where the
&ldquo;<function>np</function>&rdquo; stands for
&ldquo;non-portable.&rdquo;<indexterm>
<primary>&posix;</primary>
<secondary>non-portable</secondary>
</indexterm>
This behaviour of adding
&ldquo;<function>pthread_&hellip;_np()</function>&rdquo; functions in a
&posix;-compatible operating system is explicitly allowed by the
&posix; standard. &rtlinux; uses this behaviour, but none of its core
functionality depends on it.
</para>

</sect2>


<sect2 id="sec-minirtl">
<title>MiniRTL</title>
<para>
<indexterm><primary>MiniRTL</primary></indexterm>
<ulink
 url="http://www.thinkingnerds.com/projects/minirtl/minirtl.html">miniRTL
</ulink>
is a (not actively maintained) sub-project of &rtlinux; that offers a
small-sized real-time &linux; that is small enough to boot from a
single floppy (or small Flash memory device) into a ramdisk, yet 
offers the most important features of &linux;. miniRTL is intended to
be useful as the basis for embedded systems, but also provides a means
for real-time &ldquo;newbies&rdquo; (or non-&linux; users) to learn
more about real-time &linux;.
</para>

</sect2>


<sect2 id="sec-patent-rtlinux">
<title>The RTLinux patent</title>
<para>
&rtlinux; has matured significantly over three major versions of
&rtlinux;, and, since the 3.0 release, not many &api; changes have
occurred. This is partially due to the carefully conservative policy
of &rtlinux; maintainer Victor Yodaiken, but partially also to the
fact that &rtlinux; started with a closed-source, proprietary,
patent-protected version. That means that there are two branches of
&rtlinux;: &rtlinux;/GPL (free software), and &rtlinux;/PRO (non-free
software, where most of the developments and hardware ports are taking
place). The start of such a closed-source branch was possible, because
Yodaiken didn't include contributions in the &rtlinux; core with (GPL)
copyrights of other contributors than &fsmlabs;. This move was not too
well appreciated in the free software community, but was practically
inevitable in order to build a business around &rtlinux; development.
The support for, and response to, users of the &gpl;-ed version has
drastically been reduced.
</para>
<para><indexterm>
<primary>&rtlinux; patent</primary></indexterm>
<indexterm>
<primary>patent</primary><secondary>&rtlinux;</secondary>
</indexterm>
The &rtlinux; approach is covered by US 
<ulink
 url="http://www.patents.ibm.com/details?pn=US05995745__">Patent 5995745
</ulink>, issued on November 30, 1999. &rtlinux; comes with a
remarkable license for using this patent (see the file
<filename>PATENT</filename> in the source distribution of &rtlinux;).
The following is an excerpt from that patent license file:
</para>
<para>
<blockquote><attribution>THE OPEN RTLINUX PATENT LICENSE</attribution>
<para>
&hellip;
</para>
<para>
The Patented Process may be used, without any payment of a royalty,
with two (2) types of software. The first type is software that
operates under the terms of a GPL (as defined later in this License).
The second type is software operating under Finite State Machine Labs
Open RTLinux (as defined below). As long as the Licensee complies with
the terms and conditions of this License and, where applicable, with
the terms of the GPL, the Licensee may continue to use the Patented
Process without paying a royalty for its use.
</para>
<para>
&hellip;
</para>
</blockquote>
</para>
<para>
With this patent, &fsmlabs; tries to find a balance between stimulating
development under the &gpl; on the one hand, and generating a business
income from real-time operating system development and service on the
other hand.
This patent is (at the time of this writing) not valid outside of the
USA. &fsmlabs; has expressed its intention to <emphasis>enforce</emphasis>
the patent, which has led to very strong reactions in the free software
community. One of these reactions has been  the development of an
alternative approach, free of the patent claims
(see <xref linkend="sect-adeos">); another reaction is the massive
transition of community development efforts towards &rtai;.
</para>

</sect2>

</sect1>

<sect1 id="sect-rtai">
<title>&rtai;: the Real-Time Application Interface</title>
<para>
<ulink url="http://www.rtai.org">&rtai;</ulink> has its 
origin in &rtlinux;, when main developer Paolo Mantegazza wanted to
bring his work and experiences with real-time on DOS to &linux;. The
&ldquo;schism&rdquo; from &rtlinux; that gave birth to &rtai; occurred
quite early on in the history of &rtlinux;, when Mantegazza wanted
some features for his own work (e.g., multi-processor support) that
did not exist in &rtlinux;, and in which the &rtlinux; developers
showed no interest.
The &api;s of &rtlinux; and &rtai; are similar (both are &rtos;s
anyway), but not trivially exchangeable. And they become even more and
more distinct over time. They do, however, support about the same set
of &posix; primitives.
</para>
<para>
&rtai; is more of a &ldquo;bazaar&rdquo;-like project than &rtlinux;,
in the sense that it happily accepts contributions from anybody,
without sticking to a strict design vision, or code tree and
documentation discipline. In that sense it responds better to user
requests, evolves rapidly, but possibly at the price of giving a
chaotic impression to new users. Anyway, it has succeeded in
attracting almost all community development efforts in the area of
real-time for &linux;, at the expense of the &rtlinux; project.
</para>
<para>
This document takes &rtai; as an example &rtos; to investigate in more
technical details in <xref linkend="chap-rtai">. The following
sections give some information about an important non-technical aspect
of &rtai;: its relationship with the &rtlinux; patent
(<xref linkend="sec-patent-rtlinux">).
</para>

<sect2 id="rtai-patent">
<title>RTAI and the RTLinux patent</title>
<para>
&rtlinux;'s owner &fsmlabs; has done little to clear up the uncertainty
around the legal repercussions of its patent, which could scare away
potential commercial interest in &rtai;. However, the &rtai; community
has been able to clear up matters, in different ways:
<orderedlist>

<listitem>
<para>
The license  of the &rtai; core changed from &lgpl; to &gpl;, so that
it complies with the patent.
</para>
</listitem>

<listitem>
<para>
Eben Moglen, Professor of law at Columbia University, and legal
adviser to the
<ulink url="http://www.fsf.org">Free Software Foundation</ulink>,
published a legal study, that concludes that the patent is not
enforceable on applications done with &rtai;. A
<ulink
 url="http://www.aero.polimi.it/~rtai/documentation/articles/moglen.html">summary
of his study</ulink> can be read at the &rtai; homepage.
</para>
</listitem>

<listitem>
<para>
Karim Yaghmour's rebuttal of the &rtlinux; patent rights. Basically,
the patent was submitted too long after the patented ideas were
published and available in code form. The details can be found in his
<ulink
 url="http://www2.fsmlabs.com/mailing_list/rtl.w5archive/advocacy-0204/msg00042.html">&ldquo;Check Mate&rdquo;</ulink>
posting on the &rtlinux; advocacy mailing list.
</para>
</listitem>

<listitem>
<para>
<emphasis>&adeos;<indexterm>
<primary>&adeos;</primary></indexterm>.</emphasis>
(See <xref linkend="sect-adeos"> for more technical detail.) This is a
nano-kernel, that offers an alternative to the patented concept of
&rtlinux;.
At the time of writing, &adeos; has not yet been accepted as the real
core of &rtai;, but several positive testing and porting signs emerge
from the community.
</para>
</listitem>

<listitem>
<para>
&rtai; has introduced additions to normal &linux; task management and
scheduling, that offer the functionality to schedule
<emphasis>user space</emphasis> tasks with hard real-time determinism
(<xref linkend="rtai-lxrt">).
And &linux; user space applications are not within the scope of the
patent's claims.
</para>
</listitem>

</orderedlist>
</para>

</sect2>

</sect1>


<sect1 id="sect-uclinux">
<title>&uclinux;</title>
<para>
<ulink url="http://www.uclinux.org">&uclinux;</ulink>:
for &mmu;-less processors;
small footprint (about 500 to 900 kB);
full TCP/IP stack; support for various file systems. Has real-time
functionality too.
An introduction to &uclinux; can be found
<ulink url="http://www.snapgear.com/tb20020807.html">here</ulink>.
</para>

</sect1>

<sect1 id="sect-etlinux">
<title>&etlinux;</title>
<para>
<ulink url="http://www.prosa.it/etlinux/">&etlinux;</ulink>
is a complete &linux;-based system designed to run on very small
industrial computers, such as i386 and PC/104 modules with not more
than 2 Megabytes of RAM.
</para>

</sect1>

</chapter>


<chapter id="nonlinux">
<title>Non-Linux real-time operating systems</title>

<para> 
There are many application areas where using a &linux; kernel is not a
good idea, because of memory footprint, feature bloat, licensing and
patent issues, processor support, etc. Moreover, &linux; was certainly
not the first free software operating system, particularly not in the
area of real-time.  This chapter points out some of the non-&linux;
alternatives that are available under free software licenses.  From
them, &ecos; has probably been the most successful in gathering a
large user and development community.
</para>


<sect1 id="sect-adeos">
<title>The &adeos; nano-kernel</title>
<para>
The
<emphasis>Adaptive Domain Environment for Operating Systems</emphasis>
(<ulink
 url="">&adeos;<indexterm>
<primary>&adeos;</primary></indexterm></ulink>)
is not really an (RT)OS in itself, but a software layer between the
hardware interrupts and the operating system. Or rather, between the
hardware and the <emphasis>various</emphasis> operating systems that
can run on top of it. Indeed, &adeos; is capable of
&ldquo;hosting&rdquo; more than one OS on top of it, and these OSs
don't know about each other, as long as they ask &adeos; to pass
through the interrupts they need.
</para>

<para>
The &adeos; design was done by
Karim Yaghmour<indexterm><primary>Yaghmour, Karim</primary></indexterm>,
because he wanted to find a way to avoid the &fsmlabs; patent
(<xref linkend="sec-patent-rtlinux">)
on the real-time &linux; approach. The idea is not really new, because
Yaghmour found references from the early 90s. Philippe
Gerum<indexterm><primary>Gerum, Philippe</primary></indexterm> did
most of the work in implementing the idea into a working piece of
code. (Philippe also has complementary Free Software projects:
<ulink
 url="http://freesoftware.fsf.org/projects/xenomai/">Xenomai
</ulink> and
<ulink
 url="http://freesoftware.fsf.org/projects/carbonkernel/"> CarbonKernel 
</ulink>,
respectively aimed at real-time operating systems emulation and
simulation.)
</para>

<para>
The following text is a copy from the <filename>README</filename> file
of the &adeos; code tarball:
&ldquo;<emphasis>To share the hardware among the different OSes, Adeos
implements an
interrupt pipeline (ipipe). Every OS domain has an entry in the ipipe.
Each interrupt that comes in the ipipe is passed on to every domain
in the ipipe. Instead of disabling/enabling interrupts, each domain
in the pipeline only needs to stall/unstall his pipeline stage. If
an ipipe stage is stalled, then the interrupts do not progress in the
ipipe until that stage has been unstalled. Each stage of the ipipe
can, of course, decide to do a number of things with an interrupt.
Among other things, it can decide that it's the last recipient of the
interrupt. In that case, the ipipe does not propagate the interrupt
to the rest of the domains in the ipipe.</emphasis>&rdquo;
</para>

</sect1>


<sect1 id="sect-ecos">
<title>&ecos;</title>
<para>
(TODO: more details)
</para>

<para>
<ulink url="http://sources.redhat.com/ecos/">&ecos;</ulink>
scheduler: fast and deterministic, deals with priority inversion, but
not optimally; offers &uitron;, &posix; and &osek; &api;s and a
non-standard &api; that shows its roots in the
<acronym>Cygnus</acronym> company
(&ldquo;<function>cyg_scheduler_start()</function>&rdquo; etc.).
&dsr;: interrupts enabled but scheduling disabled. No kernel space/user
space distinction. No development on same machine. Board support
packages for a lot of processors, many of them embedded processors.
</para>
<para>
&ecos; has a quite turbulent history.  &redhat; acquired Cygnus in
1998, releasing their embedded operating systems efforts under the
&ecos; name, but fired its &ecos; development team in June 2002.
Development was taken over by 
<ulink url="http://www.ecoscentric.com/">eCosCentric</ulink>.
The license also changed over time, with the version 2.0 released under
what is largely the &gpl;, with &ldquo;guarantees&rdquo; for
compatibility with closed-source commercial components.
</para>

</sect1>


<sect1 id="sect-rtems">
<title>&rtems;</title>
<para>
The origins of
<ulink url="http://www.rtems.com/">&rtems;</ulink> lie with the
Department of Defense in the USA, that wanted an &ada;-based
&ldquo;Real-time Executive for Missile Systems.&rdquo;
This became the &ldquo;Real-time Executive for Military
Systems,&rdquo; when they realised its relevance beyond missile
control, and the &ccc; version later became the
&ldquo;Real-Time Executive for Multiprocessor Systems.&rdquo;
The &ada; version keeps the &ldquo;M&rdquo; of &ldquo;military.&rdquo;
</para>
<para>
&rtems; has a &posix; 1003.1b &api; (under construction);
multitasking for homogeneous and heterogeneous multiprocessor systems;
event-driven, priority-based, preemptive scheduling; optional rate
monotonic scheduling; intertask communication and synchronization;
priority inheritance; responsive interrupt management; dynamic memory
allocation; and it is compatible with the &gnu; tools.
</para>
<para>
(TODO: more details)
</para>

</sect1>


<sect1 id="sect-jaluna">
<title>Jaluna</title>
<para>
<emphasis>Jaluna</emphasis>
(<ulink url="http://www.jaluna.com">Jaluna<indexterm>
<primary>Jaluna</primary></indexterm></ulink>)
is an RTOS plus development environment released under a free software
license in 2002. Jaluna is based on <acronym>C5</acronym>, the 5th
generation of <acronym>Sun</acronym> Microsystems'
<acronym>ChorusOS</acronym> product.
</para>

</sect1>

<sect1 id="sect-wonka">
<title>Wonka + Oswald</title>
<para>
<ulink url="http://wonka.acunia.com">Wonka</ulink> is a free software
<emphasis>Virtual Machine</emphasis> for &java;, with a real-time
executive <emphasis>OSwald</emphasis>.
</para>
</sect1>

<!--
<sect1 id="sect-cubeos">
<title>&cubeos;</title>
<para>
 <ulink url="http://www.cubeos.org">&cubeos;</ulink> is an embedded
real-time operating system for <acronym>MC68332</acronym> &cpu;s. It
was developed in the PhD thesis of Holger Kenn,
for use in small mobile robots.
</para>
</sect1>
-->


<sect1 id="sect-fiasco">
<title>&fiasco; and &drops;</title>
<para>
<ulink url="http://os.inf.tu-dresden.de/fiasco">&fiasco;</ulink>
is a (for the time being academic)
micro-kernel<indexterm><primary>micro-kernel</primary></indexterm>
running on x86 &cpu;s. It is a pre-emptable real-time kernel supporting
hard priorities. It uses non-blocking synchronization for its kernel
objects, guarantees priority inheritance, and makes sure that runnable
high-priority processes never block waiting for lower-priority
processes.
&fiasco; is used as the kernel of the real-time operating system
<ulink url="http://os.inf.tu-dresden.de/drops/">&drops;</ulink>,
that wants to bring <emphasis>Quality of Service<indexterm>
<primary>&qos;</primary></indexterm>
<indexterm><primary>Quality of Service</primary></indexterm>
</emphasis>
to real-time operating systems.
</para>
</sect1>


<sect1 id="sect-rtmk">
<title>Real-time micro-kernel</title>
<para>
The
<ulink url="http://rtmk.sourceforge.net/">Real-time micro-kernel</ulink>
is inspired by the Mach micro-kernel, but is also meant for embedded
systems. 
</para>
</sect1>

<sect1 id="sect-kiss">
<title>KISS Realtime Kernel</title>
<para>
The <ulink url="http://kiss.sourceforge.net/">KISS Embedded
Realtime Kernel</ulink>
is an academic project, intended for use in deeply embedded
applications such as cell phones, cars, VCRs, consumer electronics,
microwave ovens, toasters and ballistic intercontinental nuclear
missiles. Being deterministic, small, readable and understandable, it
is suitable for applications where deterministic response is
primordial.
The kernel also provides <emphasis>resource tracking<indexterm>
<primary>resource tracking</primary></indexterm></emphasis>:
should an application terminate unexpectedly, all resources it had
allocated are released.
</para>
</sect1>

</chapter>

</part>

<!-- =====================P=A=R=T==II============================== -->

<part id="part2">
<title>RTOS implementation</title>

<partintro>
<para> 
This Part leaves the terrain of general concepts, and digs a bit
deeper into implementation aspects of real-time operating systems. The
&rtai; operating system is taken as an illustration of a hard
real-time operating system, and its implementation is explained in
some more detail.
</para>
</partintro>


<chapter id="chap-rtai-impl">
<title>RTAI: the implementation</title>

<para>
(TODO: lots of things. Most sections are not decently structured, and
their contents not decently checked&hellip;)
</para>
<para>
This Chapter describes the <emphasis>implementation</emphasis> of the
three basic parts of &rtai;: the hardware abstraction layer (RTHAL),
the core of real-time task scheduling, and the &ldquo;user space real
time&rdquo; &lxrt;.  The reader learns how &rtai; can be a hard
real-time kernel, while still allowing &linux; to function &ldquo;as
usual&rdquo; on the same hardware.  The discussion doesn't go into the
deepest detail of the code however, but aims at offering the
appropriate trade-off between detail and generality, to help the
interested reader to quickly understand the (not extensively
documented) &rtai; source code, and to be able to place it in the
wider context of (real-time) operating systems.
</para>


<sect1 id="rtai-sourcetree">
<title>The RTAI source tree</title>
<para>
The &rtai; source code tree doesn't reflect the subdivision into the
major components presented in the previous Chapter: the Hardware
Abstraction Layer, the &linux; compatibility layer, the core
functionality, &lxrt;, and the extended functionality packages.
So, finding where a particular feature is implemented
can be time consuming. Part of the code, of course, contains
hardware-dependent code, which contains the basis for the first three
&rtai; parts mentioned above.  This code is concentrated in the
following three directories (all directories given in this Chapter are
with respect to the &ldquo;root&rdquo; directory of the &rtai; source
tree; or the &linux; source tree, whenever applicable):
<itemizedlist>

<listitem>
<para>
<filename class=directory>patches/</filename>: this directory
contains the &linux; kernel patch, which is available for different
&linux; kernel versions and for different hardware architectures.
The contents of the &rtai; patch tend to change slightly from release
to release, because of (i) a growing number of supported &rtos;
features that need low-level support; (ii) &linux; itself evolving in
the direction of offering a cleaner HAL, so eliminating the need for
some parts of earlier patches; and (iii) code optimizations. It
is necessary to apply the correct version of the patch to a
<emphasis>clean</emphasis> &linux; kernel of the corresponding
version. And be aware that kernels that come with many &linux;
distributions have already been patched by the distributor for various
reasons, so that patching it once more with the &rtai; patch could
fail.
</para>
</listitem>

<listitem>
<para>
<filename class=directory>include/asm-xyz/</filename>, with
<filename>xyz</filename> the identifier for a particular hardware; for
example, <filename>i386</filename>, <filename>arm</filename>, or
<filename>ppc</filename>. The header files in these directories
also contain some code, often in the form of assembler in inlined
function definitions.
</para>
</listitem>

<listitem>
<para>
<filename class=directory>arch/xyz/</filename>, with
<filename>xyz</filename> the identifier for a particular hardware.
These directories, together with the above-mentioned header files,
implement the hardware-dependent parts of &rtai;'s functionality.
</para>
</listitem>

</itemizedlist>
The state of the &rtai; source tree at the time of writing is such
that it doesn't have clearly separated code trees for different stable
and unstable versions. Hence, one sometimes finds different versions
of a file in the same directory. For example,
<filename class=directory>include/asm-i386</filename> contains 
several versions of <filename>rtai.c</filename>, with names such as:
<filename>rtai.c</filename> (stable version),
<filename>allsoft.c</filename> (experimental version),
<filename>rtai-22.c</filename> (version for 2.2.x &linux; kernels).
The <emphasis>configuration</emphasis> scripts of &rtai; choose the
version that corresponds to your configuration selection, and copy
it to the &ldquo;official&rdquo; filename (which is
<filename>rtai.c</filename> in the example above).  &linux;
configuration, by the way, follows a similar approach.  The &rtai;
patch also contains adaptation to the &linux; configuration settings,
such that existing &linux; configuration tools can be used. For
example, <application>xconfig</application> or
<application>menuconfig</application>.
</para>
<para>
One of the most important files in the &rtai; source tree is the
<emphasis>patch<indexterm>
<primary>patch</primary></indexterm></emphasis> to the &linux; source
tree. The patch modifies the &linux; kernel, in order to place the
&ldquo;hooks&rdquo; to which the &rtai; functionality is attached.
Such a patch file is in &ldquo;diff&rdquo; form (see the
<command>diff</command> man page). That means that it lists only the
<emphasis>differences</emphasis> between the original &linux; source
files and the adapted &rtai; versions of these same files. This keeps
the patch file small (far below 50 kilobytes) and makes it easy to get a
good and complete overview of the changes that &rtai; applies.  The
diff file contains patches to different &linux; files, each patch
being of the following form (the markers at the end of lines are added
for annotation purposes only):
<programlistingco>
<areaspec>
<areaset id="command" coords="">
   <area id="command.1" coords=1>
   <area id="command.2" coords=2>
</areaset>
<areaset id="files" coords="">
   <area id="files.1" coords=3>
   <area id="files.2" coords=4>
</areaset>
<area id="lines" coords=5>
<areaset id="context" coords="">
   <area id="context.1" coords=6>
   <area id="context.2" coords=7>
   <area id="context.3" coords=8>
   <area id="context.4" coords=11>
   <area id="context.5" coords=12>
   <area id="context.6" coords=13>
</areaset>
<area id="minus" coords=9>
<area id="plus" coords=10>
</areaspec>
<programlisting>
<![CDATA[
diff -urN -X kernel-patches/dontdiff linux-2.4.18/Makefile
linux-2.4.18-rthal5/Makefile
--- linux-2.4.18/Makefile    Mon Feb 25 20:37:52 2002
+++ linux-2.4.18-rthal5/Makefile  Tue Feb 26 09:52:01 2002
@@ -1,7 +1,7 @@
 VERSION = 2 
 PATCHLEVEL = 4
 SUBLEVEL = 18
-EXTRAVERSION =
+EXTRAVERSION = -rthal5
  
 KERNELRELEASE=$(VERSION).$(PATCHLEVEL).$(SUBLEVEL)$(EXTRAVERSION)
  
]]>
</programlisting>
<calloutlist>
<callout arearefs="command">
<para>
This line shows the <command>diff</command> command that has produced
the patch.
</para>
</callout>
<callout arearefs="files">
<para>
These are the files in two different directories whose
<command>diff</command> is shown. One file is identified with
minus signs, the other with plus signs.
</para>
</callout>
<callout arearefs="lines">
<para>
These are the line numbers, for both files, that the following part of
the patch has changed.
</para>
</callout>
<callout arearefs="context">
<para>
This is the &ldquo;context&rdquo; of the patch. The
<command>diff</command> must always find
<emphasis>three consecutive lines</emphasis> that have remained
unchanged before and after the patched lines.
</para>
</callout>
<callout arearefs="minus">
<para>
This is the first part of the actual patch: the lines marked with
&ldquo;<![CDATA[-]]>&rdquo; represent the code of the file identified
previously with the minus signs.
</para>
</callout>
<callout arearefs="plus">
<para>
This is the second part of the patch: the lines marked with
&ldquo;<![CDATA[+]]>&rdquo; represent the code of the file identified
previously with the plus signs.
</para>
</callout>
</calloutlist>
</programlistingco>
What the simple patch above does is to fill in the
<parameter>EXTRAVERSION</parameter> parameter that &linux; provides to
identify different build versions of the same kernel. In this case,
the <parameter>-rthal5</parameter> identifier is added.
</para>
<para>
Here is a small list of &ldquo;peculiarities&rdquo; that (&linux; and
&rtai;) kernel programmers tend to use quite often, and that could
make reading kernel source code a bit tedious:
<itemizedlist>

<listitem>
<para>
<emphasis>Magic numbers</emphasis>: these are seemingly random
numbers, that appear in many data structures. An example is found in
the file <filename>include/rtai_sched.h</filename>:
<programlisting>
<![CDATA[
#define RT_TASK_MAGIC 0x754d2774
]]>
</programlisting>
This magic number is filled in in the <parameter>RT_TASK</parameter>
data structure, in the function <function>
rt_task_init_cpuid()</function> in the file
<filename>mupscheduler/rtai_sched.c</filename>:
<programlisting>
<![CDATA[
task->magic = RT_TASK_MAGIC
]]>
</programlisting>
This data structure contains all information about an &rtai; task.
Since the kernel code is in &ccc;, and a lot of use is made of
pointers to data structures, the magic numbers are used to check
whether a pointer is pointing to the right data structure: if that is
indeed the case, the magic number must be found at a prescribed place.
In the <parameter>RT_TASK</parameter> example above, this check is
performed many times in the scheduler code, as follows:
<programlisting>
<![CDATA[
if (task->magic != RT_TASK_MAGIC) { return -EINVAL; }
]]>
</programlisting>
where the error parameter <parameter>EINVAL</parameter> encodes an
invalid situation.
</para>
</listitem>

<listitem>
<para>
<parameter>do {...} while(0);</parameter>. This kind of construct
appears quite often, especially in macro definitions in header files.
At first sight, this seems a complicated procedure to execute the code
between the braces just once, but in the context of macros it has a
useful side-effect: using this <parameter>while</parameter> construct
guarantees that compilers will not optimize anything away inside the
construct, and they consider the whole construct as one single
programming primitive (i.e., macro parameter), instead of the several
individual statements that occur inside of the
<parameter>while</parameter> scope. (See the
<ulink
 url="http://kernelnewbies.org/faq/index.php3#dowhile">kernelnewbies FAQ
</ulink>
for more details.)
One example is found in the &rtai;
patch (<filename>patches/patch-2.4.18-rthal5g</filename>):
<programlisting>
<![CDATA[
-#define prepare_to_switch()    do { } while(0)
+#define prepare_to_switch() do {           \
+       if (rthal.lxrt_global_cli) {        \
+               rthal.lxrt_global_cli();    \
+       }                                   \
+} while(0)
]]>
</programlisting>
</para>
</listitem>

<listitem>
<para>
<parameter>call *SYMBOL_NAME(rthal + 8)</parameter>: these assembly
language constructs are used to call a function at byte offset
&ldquo;8&rdquo; in the <parameter>rthal</parameter> data structure.
This is the data structure used for the hardware abstraction layer,
<xref linkend="rtai-rthal">.  This complicated way to call a function
makes it possible to call different functions according to what is filled in in
that data structure.  &rtai; uses it to replace &linux; function calls
with its own function calls. The patch files contain a couple of
examples.
</para>
</listitem>

</itemizedlist>
</para>

</sect1>


<sect1 id="rtai-rthal">
<title>Hardware abstraction layer</title>
<para>
The <emphasis>RTHAL<indexterm>
<primary>RTHAL</primary></indexterm></emphasis> (Real-Time
Hardware Abstraction Layer) is, not surprisingly,
<emphasis>very</emphasis> platform-dependent. Its code typically
contains lots of assembler code that builds the low-level
<emphasis>infrastructure</emphasis>, not only for the HAL, but also
for the &linux; compatibility layer
(<xref linkend="rtai-linux-compat">), the core
(<xref linkend="rtai-core">) and for &lxrt;
(<xref linkend="rtai-lxrt">). A large part of that code comes from
&rtai;'s <emphasis>patch<indexterm>
<primary>patch</primary></indexterm></emphasis>. 
The main patch fragments (as far as the HAL is concerned) are for the
<filename>arch/xyz/kernel/irq.c</filename> and
<filename>include/asm-xyz/system.h</filename> files in the &linux;
source tree.
(Replace &ldquo;<filename>xyz</filename>&rdquo; with a supported
architecture, such as <acronym>arm</acronym>, <acronym>i386</acronym>
or <acronym>ppc</acronym>.)
The patch adds the <parameter>rthal</parameter> data structure to the
<filename>include/asm-xyz/system.h</filename> file of the &linux;
source, and changes the interrupt handling and management functions
that &linux; uses. This <parameter>rthal</parameter> is the central
data structure of &rtai;'s HAL: it collects the variables and function
calls that &linux; uses for interrupts (vector, flags, &cpu; affinity,
i.e., the <emphasis>hardware</emphasis> abstraction), and task
switching (which is the basis for the &rtai; core, <xref
linkend="rtai-core">).  In &rtai; 24.1.9, the
<parameter>rthal</parameter> data structure looks as follows:
<programlistingco>
<areaspec>
<area id="iret"      coords=2>
<area id="switchto"  coords=3>
<area id="idt"       coords=4>
<areaset id="ints"   coords="">
   <area id="ints.1" coords=5>
   <area id="ints.2" coords=6>
   <area id="ints.3" coords=7>
   <area id="ints.4" coords=8>
   <area id="ints.5" coords=9>
   <area id="ints.6" coords=10>
   <area id="ints.7" coords=11>
</areaset>
<areaset id="smp"    coords="">
   <area id="smp.1"  coords=12>
   <area id="smp.2"  coords=13>
</areaset>
<area id="ack"       coords=14>
<areaset id="lxrt"   coords="">
   <area id="lxrt.1" coords=15>
   <area id="lxrt.2" coords=16>
   <area id="lxrt.3" coords=17>
   <area id="lxrt.4" coords=18>
</areaset>
<area id="apic"      coords=19>
</areaspec>
<programlisting>
<![CDATA[
struct rt_hal {
  void *ret_from_intr;
  void *__switch_to; 
  struct desc_struct *idt_table; 
  void (*disint)(void);
  void (*enint)(void);
  unsigned int (*getflags)(void);
  void (*setflags)(unsigned int flags); 
  unsigned int (*getflags_and_cli)(void);
  void *irq_desc; 
  int *irq_vector;
  unsigned long *irq_affinity;
  void (*smp_invalidate_interrupt)(void);
  void (*ack_8259_irq)(unsigned int);
  int *idle_weight;
  void (*lxrt_global_cli)(void);    
  void (*switch_mem)(struct task_struct *, struct task_struct *, int);
  struct task_struct **init_tasks; 
  unsigned int *apicmap;
};
]]>
</programlisting>
<calloutlist>
<callout arearefs="iret">
<para>
Pointer to the &ldquo;return from interrupt&rdquo; call. By
adapting this call, it's not &linux; but &rtai; that decides what will
be done next, after an interrupt routine has finished.
(TODO: This pointer seems not to be changed any more during &rtai;'s
lifetime; is it still needed in the RTHAL?)
</para>
</callout>
<callout arearefs="switchto">
<para>
Pointer to the function that does a task switch. Again, it should be
&rtai; that controls which task to switch to.
</para>
</callout>
<callout arearefs="idt">
<para>
Pointer to the
<emphasis>Interrupt Descriptor Table (IDT)</emphasis><indexterm>
<primary>Interrupt Descriptor Table</primary></indexterm>,
<indexterm> <primary>IDT</primary></indexterm>
the data structure that holds the <emphasis>status</emphasis> of how
interrupts behave: what is the interrupt service routine attached to
an interrupt, what interrupts are enabled, and
what are their priority and status.
</para>
</callout>
<callout arearefs="ints">
<para>
These lines contain the pointers to the fundamental interrupt
<emphasis>management functions</emphasis> (disable and enable
interrupts, with or without saving of the interrupt status flags), and
data structures (interrupt descriptors: which IRQ to handle on which
&cpu;).
These pointers are filled in when &rtai; is enabled. This happens in
the function <function>__rtai_mount</function>, implemented in
<filename>arch/xyz/rtai.c</filename>.
</para>
</callout>
<callout arearefs="smp">
<para>
These are only needed in an SMP system.
The &irq; affinity remembers which interrupt numbers are possibly
reserved to what &cpu; number; the data is filled in in 
<filename>arch/xyz/rtai.c</filename>.
The <function>smp_invalidate_interrupt()</function> function is
defined in &linux;: <filename>arch/xyz/kernel/smp.c</filename>: a
&cpu; in a multi-&cpu; system
can raise a &ldquo;request for TLB invalidate&rdquo; interrupt to
signal when a page in memory has been changed, such that others can
take appropriate action to update their caches. &rtai; can catch this
interrupt, and decide when to give it to &linux;.
</para>
</callout>
<callout arearefs="ack">
<para>
This is the function with which to acknowledge the interrupts from the
timer. (The name is too much bound to the traditional 8259 timer
chips; many others are in use nowadays.)
</para>
</callout>
<callout arearefs="lxrt">
<para>
This is used in &lxrt; scheduling; see
<filename>lxrt/lxrt.c</filename>.
The <function>init_tasks()</function> function is defined in &linux;: 
<filename>arch/xyz/kernel/smp.c</filename>.
</para>
</callout>
<callout arearefs="apic">
<para>
Points to a &linux;-defined vector (in
<filename>arch/xyz/kernel/smpboot.c</filename>) called 
&ldquo;physical_apicid_2_cpu&rdquo;, which is filled at boot time and
maps the physical APIC (<xref linkend="sect-inter-hw">) interrupt
controller identifiers to logical &cpu; identifiers.
</para>
</callout>
</calloutlist>
</programlistingco>
The form of the entries in the
<emphasis>Interrupt Descriptor Table</emphasis> data structure is
defined in &linux; (<filename>linux/irq.h</filename>):
<programlisting>
<![CDATA[
typedef struct {
   unsigned int status;        // IRQ status 
   hw_irq_controller *handler; // functions to manage hardware interrupts
                                  // (see below)
   struct irqaction *action;   // IRQ action list
                                  // (see below)
   unsigned int depth;         // nested irq disables 
   spinlock_t lock;            // lock used to access handler and
                               // action lists
} ____cacheline_aligned irq_desc_t;

]]>
</programlisting>
This pointer to <parameter>irq_desc_t</parameter> is filled in in
<filename>arch/xyz/rtai.c</filename>.  The file
<filename>linux/interrupt.h</filename> defines the
<parameter>irqaction</parameter> field, that contains all information
about a specific interrupt handler:
<programlisting>
<![CDATA[
struct irqaction {            
      // function to execute:
   void (*handler)(int, void *, struct pt_regs *);
   unsigned long flags;    // saved flags at moment of interrupt
   unsigned long mask;     // interrupt mask
   const char *name;       // name of handler
   void *dev_id;           // identifier of interrupting device
   struct irqaction *next; // pointer to next handler structure
};
]]>
</programlisting>
And the file <filename>linux/irq.h</filename> defines the
<parameter>hw_irq_controller</parameter> data structure:
<programlisting>
<![CDATA[
struct hw_interrupt_type {
   const char * typename;
   unsigned int (*startup)(unsigned int irq);
   void (*shutdown)(unsigned int irq);
   void (*enable)(unsigned int irq);
   void (*disable)(unsigned int irq);
   void (*ack)(unsigned int irq);
   void (*end)(unsigned int irq);
   void (*set_affinity)(unsigned int irq, unsigned long mask);
};

typedef struct hw_interrupt_type  hw_irq_controller;
]]>
</programlisting>
This data structure contains pointers to the functions needed to
manage the hardware interrupts: how to enable and disable an
interrupt, how to acknowledge an interrupt, how to attach an
interrupt to a set of &cpu;s (&ldquo;affinity&rdquo;), etc.
</para>
<para>
The core of the HAL patch works as follows:
<orderedlist>

<listitem>
<para>
&rtai; replaces &linux; functions that work with the interrupt
hardware with <emphasis>pointers</emphasis> to functions.
</para>
</listitem>

<listitem>
<para>
&rtai; introduces the <parameter>rthal</parameter> data structure to
store all these pointers.
</para>
</listitem>

<listitem>
<para>
&rtai; can now switch these pointers to pointers to its own functions
whenever it wants.
</para>
</listitem>

</orderedlist>
An example of this approach to replace original &linux; functions with
pointer entries from the <parameter>rthal</parameter> data structure
can be seen in the patch to the <filename>system.h</filename> file:
<programlisting>
<![CDATA[
#define __cli()                (rthal.disint())
#define __sti()                (rthal.enint())
#define __save_flags(x)        ((x) = rthal.getflags())
#define __restore_flags(x)     (rthal.setflags(x))
]]>
</programlisting>
Here, the disable and enable interrupt functions are replaced, as well
as the functions that save and restore the interrupt status flags.
The patch, of course, also introduces &ldquo;hard&rdquo; versions of
these functions, so that &rtai; can work on the real hardware, while
&linux; works with the &ldquo;soft&rdquo; versions (i.e., the
interrupts for these soft versions come from the &rtai; software, and
not from the hardware). For example, the
<function>hard_cli()</function> and <function>hard_sti()</function>
functions (that the patch adds to &linux;'s
<filename>include/asm-xyz/system.h</filename> file) get the
functionality of the original <function>__cli()</function> and
<function>__sti()</function> of &linux;. This is again done in the
patch file:
<programlisting>
<![CDATA[
+#define hard_sti() __asm__ __volatile__ ("sti": : :"memory")
+#define hard_cli() __asm__ __volatile__ ("cli": : :"memory")
]]>
</programlisting>
The original <function>__cli()</function> and
<function>__sti()</function> of &linux; are replaced by soft
versions, as seen in the code example above. 
</para>
<para>
Another (assembly code) example of the procedure to let &linux; work
with &ldquo;intercepted&rdquo; function calls, is the following
patch fragment for the <filename>arch/i386/kernel/entry.S</filename>
file:
<programlistingco>
<areaspec>
<area id="area1"      coords=11>
<areaset id="area2"   coords="">
   <area id="area2.1" coords=16>
   <area id="area2.2" coords=17>
   <area id="area2.3" coords=24>
   <area id="area2.4" coords=25>
</areaset>
</areaspec>
<programlisting>
<![CDATA[
 ENTRY(ret_from_fork)
+       sti
        pushl %ebx
        call SYMBOL_NAME(schedule_tail)
        addl $4, %esp
@@ -202,17 +203,20 @@
        call *SYMBOL_NAME(sys_call_table)(,%eax,4)
        movl %eax,EAX(%esp)   # save the return value
 ENTRY(ret_from_sys_call)
-       cli            # need_resched and signals atomic test
+       call *SYMBOL_NAME(rthal + 12)
        cmpl $0,need_resched(%ebx)
        jne reschedule
        cmpl $0,sigpending(%ebx)
        jne signal_return
+       sti
+       call *SYMBOL_NAME(rthal + 16)
 restore_all:
        RESTORE_ALL

        ALIGN
 signal_return:
-       sti     # we can get here from an interrupt handler
+       sti   # we can get here from an interrupt handler
+       call *SYMBOL_NAME(rthal + 16)
        testl $(VM_MASK),EFLAGS(%esp)
        movl %esp,%eax
        jne v86_signal_return
]]>
</programlisting>
<calloutlist>
<callout arearefs="area1">
<para>
The original <function>cli</function> (to disable interrupts) is
replaced by a call to the function that sits on offset
&ldquo;12&rdquo; in the <parameter>rthal</parameter> data structure.
With four bytes per pointer, this corresponds to the fourth line of
that data structure, i.e., the place where &rtai; fills in its own
version of the disable interrupt call.
</para>
</callout>
<callout arearefs="area2">
<para>
Similarly, but now for the function at offset &ldquo;16&rdquo;, i.e.,
the enable interrupt function of &rtai;.
</para>
</callout>
</calloutlist>
</programlistingco>
All the adapted function definitions are finally filled in in the
<parameter>rthal</parameter> data structure in the file
<filename>arch/xyz/kernel/irq.c</filename>:
<programlisting>
<![CDATA[
struct rt_hal rthal = {
   &ret_from_intr,
   __switch_to,
   idt_table,
   linux_cli,
   linux_sti,
   linux_save_flags,
   linux_restore_flags,
   linux_save_flags_and_cli,
   irq_desc,
   irq_vector,
   irq_affinity,
   smp_invalidate_interrupt,
   ack_8259_irq,
   &idle_weight,
   0,   // lxrt_global_cli
   switch_mem,
   init_tasks,
   physical_apicid_2_cpu
};
]]>
</programlisting>
That is, they get pointers to their original &linux; functions, or to
patched functions that have the original &linux; behaviour. The reason
is, of course, that, at boot time, the system should behave as normal
&linux;.
(Some of the entries in the <parameter>rthal</parameter> data structure
have not been discussed yet, because they do not really belong to the
<emphasis>hardware</emphasis> abstraction, but are meant to support
the core functionality of &rtai;, <xref linkend="rtai-core">.)
So, at boot time, &linux; runs as if nothing has happened, except for
a small loss in performance, due to the extra level of indirection
introduced by replacing function calls by
<emphasis>pointers to function calls</emphasis> in the
<parameter>rthal</parameter> structure. The user can
activate the &rtai; functionality at any later time, via a loadable
module that executes the <function>rt_mount_rtai()</function> function (file
<filename>arch/xyz/rtai.c</filename>). This
switches the pointers to functions in the <parameter>rthal</parameter>
data structure from their &linux; version to their &rtai; version.
From that moment on, &linux; is under control of the &rtai; kernel,
because &linux; works with what it thinks is the &ldquo;real&rdquo;
hardware through the replacement functions that &rtai; has installed.
But these functions give a <emphasis>virtual</emphasis> hardware to
&linux;, while &rtai; manages the real hardware. For example, &rtai;
queues interrupts for &linux; <emphasis>in software</emphasis> until
&linux; gets a chance to run again; at that moment, the pending
interrupts seem to come from the hardware, as far as the &linux; side
is concerned.
</para>

<para>
In principle, the HAL could be used for other purposes than serving
as a stub for the &rtai; core. That is, another kind of operating
system could be implemented on top of the RTHAL. But also the opposite
could be done, i.e., implementing the same &rtai; core on top of another
low-level stub. This is what is being done in the ongoing integration
of &rtai; and &adeos; (<xref linkend="sect-adeos">).
This effort, however, experiences some problems because &rtai;
currently doesn't make a clean distinction between what is needed for
a real <emphasis>hardware abstraction</emphasis> on the one hand, and
what is needed for <emphasis>replacing &linux;</emphasis> on the other
hand. So, it is not straightforward to get the RTHAL alone, without
any mention of the &rtai; core or the &linux; compatibility
structures. For example, the patch and the
<filename>include/arch/xyz/rtai.c</filename> file mixes both parts.
</para>
<para>
(TODO: explain implementation of all &rtai; spinlocks and interrupt
disabling/enabling functions; and <function>dispatch_trap</function> in 
<filename>rtai.c</filename> (checks whether traps come from NMI,
&linux;, debugger, or &rtai;); what do SRQs do?
srqisr(),rt_request_srq() in rtai.c? use 0xFE as IRQ, 
<parameter>#define RTAI_SYS_VECTOR 0xFE</parameter> in
<filename>include/asm-i386/rtai_srq.h</filename>? 
rtai_open_srq(); implementation of barrier
<filename>bits/rtai_bits.c</filename>)
</para>

</sect1>


<sect1 id="rtai-linux-compat">
<title>Linux compatibility layer</title>
<para>
&rtai; is developed to cooperate closely with &linux;, and to let
&linux; take care of all non hard real-time tasks, such as networking,
file IO, user interfacing, etc. But the cooperation with &linux; is a
<emphasis>one-way</emphasis> endeavour: &linux; development doesn't
(want to) care about how it could facilitate development of an &rtos;
below it. And its data structures are not fully appropriate for
real-time. So, &rtai; must place hooks in the &linux; code, for the
following things:
<itemizedlist>

<listitem>
<para>
Task data structures.
</para>
</listitem>

<listitem>
<para>
Timing.
</para>
</listitem>

<listitem>
<para>
&lxrt; (<xref linkend="rtai-lxrt">): this requires interaction with
&linux; scheduling.
</para>
</listitem>

</itemizedlist>
</para>
<para>
A first part of the
&linux; compatibility interface
consists of data structures in which &rtai; stores the state in
which it finds the running &linux; kernel at the moment that it (i.e.,
&rtai;) becomes active
(<filename>arch/xyz/rtai.c</filename> and
<filename class=headerfile>include/asm-xyz/rtai.h</filename>):
<programlisting>
<![CDATA[
static struct rt_hal linux_rthal;
static struct desc_struct linux_idt_table[256];
static void (*linux_isr[256])(void);
static struct hw_interrupt_type *linux_irq_desc_handler[NR_GLOBAL_IRQS];
]]>
</programlisting>
This state is restored when the &rtai; module is unloaded.
The &linux; state is stored, and &rtai; functionality is loaded, in the
<function>init_module()</function> of <filename>rtai.c</filename>.
This file, and its <filename>include/rtai-xy.h</filename> header file
(with <filename>xy</filename> the &rtai; version), further implement
the basic function calls of a hard real-time kernel
(<xref linkend="rtai-core">). Note that
<emphasis>global</emphasis> locks (similar to the 
<emphasis>Big Kernel Lock</emphasis>,<indexterm>
<primary>Big Kernel Lock</primary></indexterm>
<xref linkend="arch">) are available in &rtai;. These locks, however,
cannot be taken by user space processes, because the global locks in
&linux; have been virtualised.
</para>
<para>
The <parameter>rthal</parameter> data structure in the &rtai; patch
contains not only <emphasis>hardware-related</emphasis> fields
(everything concerning interrupts), but also some
<emphasis>software-related</emphasis> entries, such as task switching
functions that have to work together with &linux;. For example, the 
patch extends the <parameter>task_struct</parameter> data structure in
<filename>include/linux/sched.h</filename> of the &linux; source with
<programlisting>
<![CDATA[
void *this_rt_task[2];
]]>
</programlisting>
to accommodate a real-time task queue. The two pointers to real-time
tasks are initialized to 0:
<programlisting>
<![CDATA[
this_rt_task:      {0,0}
]]>
</programlisting>
because at &linux; boot, no real-time tasks exist yet.
The other <emphasis>&linux;-compatibility</emphasis> entries in
the <parameter>rthal</parameter> data structure are:
<programlisting>
<![CDATA[
struct rt_hal {
  void *ret_from_intr;
  void *__switch_to; 
...
  int *idle_weight;
...
  void (*switch_mem)(struct task_struct *, struct task_struct *, int);
  struct task_struct **init_tasks; 
...
};
]]>
</programlisting>
The patch adds code to the source of the
<filename>linux/kernel/exit.c</filename> file in the &linux;
source, to execute a <emphasis>callback<indexterm>
<primary>callback</primary></indexterm></emphasis> to &rtai; at the
moment that a real-time task is stopped.
In <filename>linux/kernel/sched.c</filename> of the &linux; source,
the scheduler is extended to work also with the &lxrt; tasks
(<xref linkend="rtai-lxrt">).
</para>

</sect1>


<sect1 id="rtai-core">
<title>RTOS core</title>
<para>
The &rtos; core relies on the RTHAL and &linux; compatibility
&ldquo;tricks&rdquo; of the previous sections, to build a hard
real-time kernel on top of the interrupt system of the hardware, and
integrated with the task management of &linux;.
<xref linkend="chap-rtai"> gives
more details about <emphasis>what</emphasis> functionality is offered;
this section deals with <emphasis>how</emphasis> &rtai; implements
this functionality. The core's functionality consists of:
task management and scheduling, interrupts and traps, synchronization
and data exchange, and memory management.
</para>
<para>
The code of the &rtai; core resides in the
<filename class=headerfile>include/asm-generic/rtai.h</filename>,
<filename class=headerfile>include/asm-xyz/rtai.h</filename> and
<filename>include/arch/xyz/rtai.c</filename> files.
The central data structure is the one that stores the global status of
the &rtai; core:
<programlisting>
<![CDATA[
struct global_rt_status {
  volatile unsigned int pending_irqs;
  volatile unsigned int activ_irqs;
  volatile unsigned int pending_srqs;
  volatile unsigned int activ_srqs;
  volatile unsigned int cpu_in_sti;
  volatile unsigned int used_by_linux;
  volatile unsigned int locked_cpus;
  volatile unsigned int hard_nesting;
  volatile unsigned int hard_lock_all_service;
  spinlock_t hard_lock;
  spinlock_t data_lock;
};
]]>
</programlisting>
</para>


<sect2 id="rtai-core-tasksched">
<title>Task management and scheduling</title>
<para>
</para>
<para>
Task switching happens through the
<parameter>__switch_to</parameter> function in the
<parameter>rthal</parameter> data structure; this can be seen in the
following assembler code:
<programlisting>
<![CDATA[
"jmp *"SYMBOL_NAME_STR(rthal + 4)"\n"
]]>
</programlisting>
The function on offset &ldquo;4&rdquo; in <parameter>rthal</parameter>
is indeed <parameter>__switch_to</parameter>. Also in this file is the
trap handling; the relevant part in this assembler code is where the
appropriate handler is called:
<programlisting>
<![CDATA[
"call "SYMBOL_NAME_STR(lxrt_handler)"
]]>
</programlisting>
This handler is filled in in (TODO).
At the end of this assembler code, the &ldquo;return from
interrupt&rdquo; is performed, again by calling the corresponding
functions on the <parameter>rthal</parameter> data structure:
<programlisting>
<![CDATA[
"1:call *" SYMBOL_NAME_STR(rthal + 16) "\n\t"
"jmp *" SYMBOL_NAME_STR(rthal)
]]>
</programlisting>
</para>
<para>
<emphasis>Time management</emphasis> is very important for a real-time
operating system, so &rtai; has a bunch of functionality in its core
to work with the time hardware. The low-level functions can be found
in <filename>arch/xyz/rtai.c</filename>; for example:
<programlisting>
<![CDATA[
int rt_request_timer(
  void (*handler)(void),
  unsigned int tick,
  int apic)

void rt_free_timer(void)

void rt_request_timer_cpuid(
  void (*handler)(void),
  unsigned int tick,
  int cpuid)

void rt_request_apic_timers(
  void (*handler)(void),
  struct apic_timer_setup_data *apic_timer_data)

void rt_free_apic_timers(void)

void setup_periodic_apic(
  unsigned int count,
  unsigned int vector)

void setup_oneshot_apic(
  unsigned int count,
  unsigned int vector)

...
]]>
</programlisting>
</para>

</sect2>

<sect2 id="rtai-core-int">
<title>Interrupts and traps</title>
<para>
The &ldquo;<emphasis>encoded trap<indexterm>
<primary>encoded trap</primary></indexterm>
<indexterm>
 <primary>trap</primary><secondary>encoded</secondary>
</indexterm></emphasis>&rdquo; technique consists of two parts:
<orderedlist>

<listitem>
<para>
Allowing a user space task to execute a kernel function.
</para>
</listitem>

<listitem>
<para>
Incorporating a user space task into the real-time scheduling.
This requires an adaptation of the standard &linux; task bookkeeping.
</para>
</listitem>

</orderedlist>
The first functionality is implemented via the use of a
<emphasis>trap<indexterm><primary>trap</primary></indexterm></emphasis>
(<xref linkend="sect-inter-sw">). The trap allows the user space task
to launch a kernel space function (the &ldquo;trap handler&rdquo;).
The user space task <emphasis>encodes</emphasis> the desired
real-time service in a set of two integers that it puts on the trap
handler stack; it can, in addition, also pass some
<emphasis>arguments</emphasis> to the trap handler. 
<function>dispatch_trap()</function> in
<filename>arch/xyz/rtai.c</filename> does the trap handling.</para>

</sect2>


<sect2 id="rtai-core-ipc">
<title>IPC</title>
<para>
locks, etc.
</para>

</sect2>


<sect2 id="rtai-core-mm">
<title>Memory management</title>
<para>
<function>mlockall( MCL_CURRENT | MCL_FUTURE)</function>: &posix;
function (coming from the &linux; source tree:
<filename>linux/include/asm-xyz/mman.h</filename>) that locks all pages
of the calling task in memory; the parameters are macros that indicate
that all current pages must be locked, but also all pages that the
task will ask for in the future.
</para>

</sect2>

</sect1>


<sect1 id="rtai-lxrt">
<title>LX/RT</title>
<para>
&lxrt; stands for &ldquo;&linux;/real-time&rdquo;, i.e., it
offers <emphasis>soft</emphasis> and
<emphasis>hard</emphasis> real-time functionality to &linux; user
space tasks. This functionality is only slightly limited with respect
to what can be achieved in kernel space. The ultimate goal is a fully
&ldquo;<emphasis>symmetric &api;</emphasis>&rdquo;, i.e., to offer the
same real-time &api; to user space tasks as what is available to
&rtai; kernel tasks. A symmetric &api;, available in user space,
reduces the threshold for new
users to start using real-time in their applications, but it also
allows for easier debugging when writing new applications. The bad
news is that it makes understanding the &rtai; code a bit more
difficult, because similar function calls get different
implementations, depending on their usage in kernel space or in user
space.
This symmetry, obviously, can never be absolute and only works from
kernel space to user space, and not the other way around: it is not
possible to bring an <emphasis>arbitrary</emphasis> user space
function to the kernel, because it would use functions that are not
available in the kernel.  Also, the user space task that one wants to
execute in hard real-time via &lxrt; should satisfy all constraints of
hard real-time: no non-deterministic or blocking calls, etc.
</para>
<para>
The &lxrt; idea is quite old, actually, and has gone through various
stages of evolution. The <emphasis>first generation</emphasis> used
the idea to let a user space task run a companion task in the kernel,
i.e., the so-called &ldquo;buddy&rdquo;<indexterm>
<primary>buddy</primary><secondary>&rtai;</secondary>
</indexterm>
in &rtai; language. This companion task executes
kernel space functions on behalf of the user space task. Technically speaking,
this is realized by passing an identifier of the required function to
a <emphasis>trap handler</emphasis>, which then executes the function
call that corresponds to this identifier
(<xref linkend="rtai-core-int">); there is another &ldquo;kernel space/user
space&rdquo; switch to return. 
</para>
<para>
The <emphasis>second generation</emphasis> design of &lxrt; 
(appropriately called <emphasis>new &lxrt;</emphasis>)<indexterm>
<primary>new &lxrt;</primary></indexterm>
<indexterm>
 <primary>&lxrt;</primary><secondary>new</secondary>
</indexterm>
needs only one switch, doesn't use a &ldquo;buddy&rdquo; anymore, and
integrates maximally with existing &linux; task scheduling. This means
that &linux; is not any more the &ldquo;idle&rdquo; task of the
&rtos;, but &linux; itself has been extended with full pre-emption and
real-time scheduling (for tasks that obey certain restrictions).
</para>
<para>
This clear distinction between first and second generation has only
occurred <emphasis>after</emphasis> the facts: there have been several
prototypes in various releases of &rtai;, with names such as
&ldquo;&lxrt; extended&rdquo;,<indexterm>
<primary>&lxrt; extended</primary></indexterm>
<indexterm>
 <primary>&lxrt;</primary><secondary>extended</secondary>
</indexterm>
&ldquo;ALLSOFT&rdquo;,<indexterm>
<primary>ALLSOFT</primary></indexterm>
<indexterm>
 <primary>&lxrt;</primary><secondary>ALLSOFT</secondary>
</indexterm>
or
&ldquo;USP&rdquo;.<indexterm>
<primary>USP</primary></indexterm>
<indexterm>
 <primary>&lxrt;</primary><secondary>USP</secondary>
</indexterm>
This has led to some confusion, but in the future only the two
above-mentioned approaches will be supported.
</para>
<para>
From a user's perspective, the <emphasis>difference</emphasis> between
the soft and hard versions of &lxrt; is that the hard version disables
software interrupts when the &lxrt; task runs.
</para>


<sect2 id="rtai-lxrt-user">
<title>LX/RT for the user</title>
<para>
First, make the &lxrt; functionality available by loading the &lxrt;
module, so that your tasks can use it. A typical &lxrt;
user task looks like this:
<programlistingco>
<areaspec>
<areaset id="x" coords="">
   <area id="x.1" coords=1>
</areaset>
<area id="y" coords=1>
</areaspec>
<programlisting>
<![CDATA[
... TODO ...
]]>
</programlisting>
<calloutlist>
<callout arearefs="x">
<para>
</para>
</callout>
</calloutlist>
</programlistingco>
A maximum of <parameter>MAX_SRQ</parameter> &linux; tasks can be made
into hard real-time &lxrt; tasks. (This constant is set to 128 in
&rtai; 24.1.9, in the file
<filename class=headerfile>include/rtai_lxrt.h</filename>.)
The user space task can also
<emphasis>register a name</emphasis> for itself, consisting of at most
six characters. This naming allows a &lxrt; task to call all &lxrt;
functions via their &ldquo;named&rdquo; version; for example,
<function>rt_named_task_init()</function>. The task name can also be
used by other tasks than the one that gave the name, for example to
send messages to each other.
</para>
<para>
A user space task is made into an &lxrt; user space task by using only
a couple of &lxrt; calls. The task calls
<function>rt_make_hard_real_time()</function> (in
<filename>include/rtai_lxrt.h</filename>) at the moment it wants
to switch to real-time, and
<function>rt_make_soft_real_time()</function> to switch back.
(Commenting out these functions is an easy way to allow user space
debugging.) &lxrt; also provides function
calls to detect whether the calling task is currently running in hard
real-time (<function>rt_is_linux()</function>, in
<filename>arch/xyz/rtai.c</filename>) or in &lxrt;
(<function>rt_is_lxrt()</function>), and whether it
wants to use the floating point registers of the &cpu;.
</para>
<para>
(TODO: name registry.)
</para>

</sect2>

<sect2 id="rtai-lxrt-code">
<title>LX/RT implementation</title>
<para>
(TODO: sketch the dependencies between files in &lxrt;;
document the encoding; stuff already done in
<filename>Documentation/README.EXTENDING_LXRT</filename>.)
</para>
<para>
The <function>rt_task_init()</function> function is
implemented in the file <filename>include/rtai_lxrt.h</filename>. Its
arguments are:
<programlisting>
<![CDATA[
LX_TASK *rt_task_init(
   unsigned int tasknum,  // number of the task
   int prio,              // desired priority level
   int stack_size,        // allocated stack size
   int max_msg_size)      // max size of inter-task message buffer
]]>
</programlisting>
This function call eventually ends up in the
<function>__task_init()</function> in
<filename>lxrt/lxrt.c</filename>, which initializes all parameters of
the task and allocates the memory required for them. These are not
only the parameters that the application programmer sets, but also the
parameters needed behind the screens: the memory to communicate data
to the trap handler, and the task data structure.
</para>
<para>
The real heavy part of &lxrt; lies in
<function>__lxrt_resume()</function> in
<filename>lxrt/lxrt.c</filename>. This function takes care of the
seamless integration with &linux; task management. 
One of the calls it makes is to <function>__emuser_trxl()</function>
in
<filename>XYZscheduler/rtai_sched.c</filename> (where
<function>XYZ</function> stands for
&ldquo;<function>up</function>&rdquo;
(uni-processor),
&ldquo;<function>mup</function>&rdquo;
(multiple uni-processors), or
&ldquo;<function>smp</function>&rdquo;
(symmetric multi-processor)):
<programlistingco>
<areaspec>
<area id="ready"   coords=4>
<area id="enq"     coords=5>
<area id="switch"  coords=6>
<area id="save"    coords=7>
<area id="switch2" coords=8>
<area id="signal"  coords=9>
</areaspec>
<programlisting>
<![CDATA[
static inline void __emuser_trxl(RT_TASK *new_task)
{

  if ((new_task->state |= READY) == READY) {
    enq_ready_task(new_task);
    rt_switch_to_real_time(0);
    save_cr0_and_clts(linux_cr0);
    rt_switch_to(new_task);
    if (rt_current->signal) { (*rt_current->signal)(); }
  }
}
]]>
</programlisting>
<calloutlist>
<callout arearefs="ready">
<para>
&hellip;
</para>
</callout>
<callout arearefs="enq">
<para>
&hellip;
</para>
</callout>
<callout arearefs="switch">
<para>
&hellip;
</para>
</callout>
<callout arearefs="save">
<para>
&hellip;
</para>
</callout>
<callout arearefs="switch2">
<para>
&hellip;
</para>
</callout>
<callout arearefs="signal">
<para>
&hellip;
</para>
</callout>
</calloutlist>
</programlistingco>
Two important functions in <filename>lxrt/lxrt.c</filename> are:
<function>steal_from_linux()</function>,
and <function>give_back_to_linux()</function>:
<programlistingco>
<areaspec>
<area id="globalcli"   coords=6>
<area id="ffnz"        coords=7>
<area id="klistp"      coords=8>
<area id="hardcli"     coords=9>
<area id="rttask1"      coords=10>
<area id="maxsrq"      coords=11>
<area id="hardsti"     coords=12>
<area id="state"       coords=13>
<area id="wakeup"      coords=14>
<area id="schedule"    coords=15>
<area id="ishard1"     coords=16>
<area id="hardsti2"    coords=17>
<area id="fpu"         coords=18>
</areaspec>
<programlisting>
<![CDATA[
static void steal_from_linux(RT_TASK *rt_task)
{
   int cpuid;
   struct klist_t *klistp;
      ...
   rthal.lxrt_global_cli = linux_lxrt_global_cli;
   cpuid = ffnz((rt_task->lnxtsk)->cpus_allowed);
   klistp = klistbp[cpuid];
   hard_cli();
   klistp->task[klistp->in] = rt_task;
   klistp->in = (klistp->in + 1) & (MAX_SRQ - 1);
   hard_sti();
   current->state = TASK_LXRT_OWNED;
   wake_up_process(kthreadb[cpuid]);
   schedule();
   rt_task->is_hard = 1;
   HARD_STI();
   if (current->used_math) { restore_fpu(current); }
}
]]>
</programlisting>
<calloutlist>
<callout arearefs="globalcli">
<para>
&hellip;
</para>
</callout>
<callout arearefs="ffnz">
<para>
&hellip;
</para>
</callout>
<callout arearefs="klistp">
<para>
&hellip;
</para>
</callout>
<callout arearefs="hardcli">
<para>
&hellip;
</para>
</callout>
<callout arearefs="rttask1">
<para>
&hellip;
</para>
</callout>
<callout arearefs="maxsrq">
<para>
&hellip;
</para>
</callout>
<callout arearefs="hardsti">
<para>
&hellip;
</para>
</callout>
<callout arearefs="state">
<para>
&hellip;
</para>
</callout>
<callout arearefs="wakeup">
<para>
&hellip;
</para>
</callout>
<callout arearefs="schedule">
<para>
&hellip;
</para>
</callout>
<callout arearefs="ishard1">
<para>
&hellip;
</para>
</callout>
<callout arearefs="hardsti2">
<para>
&hellip;
</para>
</callout>
<callout arearefs="fpu">
<para>
&hellip;
</para>
</callout>
</calloutlist>
</programlistingco>
<programlistingco>
<areaspec>
<area id="intrap"      coords=8>
<area id="signallin"   coords=9>
<areaset id="nottrap"  coords="">
  <area id="nottrap.1" coords=13>
  <area id="nottrap.2" coords=14>
  <area id="nottrap.3" coords=15>
  <area id="nottrap.4" coords=16>
</areaset>
<area id="remready"    coords=18>
<area id="ishard0"     coords=20>
</areaspec>
<programlisting>
<![CDATA[
static void give_back_to_linux(RT_TASK *rt_task, int in_trap)
{
  int cpuid;
  struct klist_t *klistp;
   ...
  cpuid = ffnz((rt_task->lnxtsk)->cpus_allowed);
  hard_cli();
  if (in_trap) {
          rt_signal_linux_task((void *)0, 0, rt_task);
  } else {
          klistp = klistep[cpuid];
          klistp->task[klistp->in] = rt_task->lnxtsk;
          klistp->in = (klistp->in + 1) & (MAX_SRQ - 1);
          rt_pend_linux_srq(sysrq.srq);
  }
  rem_ready_task(rt_task);
  lxrt_schedule(cpuid);
  rt_task->is_hard = 0;
  hard_sti();
}
]]>
</programlisting>
<calloutlist>
<callout arearefs="intrap">
<para>
&hellip;
</para>
</callout>
<callout arearefs="signallin">
<para>
&hellip;
</para>
</callout>
<callout arearefs="nottrap">
<para>
&hellip;
</para>
</callout>
<callout arearefs="remready">
<para>
&hellip;
</para>
</callout>
<callout arearefs="ishard0">
<para>
&hellip;
</para>
</callout>
</calloutlist>
</programlistingco>
&lxrt; uses &linux; kernel threads<indexterm>
<primary>kernel thread</primary></indexterm>
<indexterm>
 <primary>thread</primary><secondary>kernel</secondary>
</indexterm>
 <function>kthread_b()</function>
(&ldquo;<emphasis>kernel thread begin</emphasis>&rdquo;)
and <function>kthread_e()</function>
(&ldquo;<emphasis>kernel thread end</emphasis>&rdquo;), with its own
<function>lxrt_schedule()</function> scheduling.
</para>
<para>
(TODO: what do <function>kthread_b()</function>
and <function>kthread_e()</function> really do?)
</para>
<para>
&rtai; has had different versions of &lxrt; functionality. The later
ones are more robust against a task crash in the user space &linux;
side. At that moment, &linux; executes a
<function>do_exit()</function> function, and the &rtai; patch has
added a pointer to a callback function in that function.  The callback
is used to free the resources that were registered by the real-time
buddy. It also deletes the real-time buddy task, and unblocks any
other task that may have engaged in blocking &ipc; (e.g., via a
semaphore) with the real time task.
</para>
<para>
This section discusses the implementation of the &lxrt; techniques,
(at the time of writing, only fully supported on
<acronym>i386</acronym> hardware) as implemented in the following
files in the &rtai; source tree:
<itemizedlist>

<listitem>
<para>
<filename>include/asm-i386/rtai_lxrt.h</filename>
</para>
</listitem>

<listitem>
<para>
<filename>include/rtai_lxrt.h</filename>
</para>
</listitem>

<listitem>
<para>
<filename>lxrt/lxrt.c</filename>
</para>
</listitem>

<listitem>
<para>
<filename>ABCscheduler/rtai_sched.c</filename>, where
<filename>ABC</filename> stands for either 
<filename>up</filename> (<emphasis>uni-processor</emphasis>), or
<filename>mup</filename>
(<emphasis>multiple uni-processors</emphasis>), or
<filename>smp</filename>
(<emphasis>symmetric multi-processors</emphasis>).
</para>
</listitem>

</itemizedlist>
This involvement in &lxrt; of a file called &ldquo;scheduler&rdquo; is
one of these unfortunate things that make &rtai; a confusingly
documented project&hellip; The reason it is needed in the discussion
on &lxrt; is that it contains the implementation of the
<emphasis>&rtai; kernel space</emphasis> function
<function>rt_task_init()</function>, which starts up a kernel space
&ldquo;proxy<indexterm><primary>proxy</primary></indexterm>&rdquo;
(or <function>buddy_fun()</function> as it is called in
<filename>lxrt/lxrt.c</filename>) for each user space &lxrt; task that
calls the <emphasis>user space</emphasis> function with the same name,
<function>rt_task_init()</function>, but with different arguments.
This <function>buddy_fun()</function> function has, at first sight, a
strange implementation:
<programlisting>
<![CDATA[
struct fun_args { int a0; int a1; int a2; int a3; int a4; int a5; \
   int a6; int a7; int a8; int a9; long long (*fun)(int, ...); };

static void buddy_fun(RT_TASK *mytask)
{
  struct fun_args *arg = (void *)mytask->fun_args;
  while (1) {
    mytask->retval = arg->fun( \
      arg->a0, arg->a1, arg->a2,  arg->a3, arg->a4, \
      arg->a5, arg->a6, arg->a7, arg->a8, arg->a9);
    lxrt_suspend(mytask);
  }
}
]]>
</programlisting>
So, <emphasis>every</emphasis> &lxrt; task gets this same
<function>buddy_fun()</function> as its buddy. But yet the result of
executing it differs from task to task, and from activation to
activation of the buddy, because the buddy executes
the function (and the arguments) that it got in the
<emphasis>list of parameters</emphasis> from the
<function>lxrt_handler()</function> trap handler
(<xref linkend="rtai-core-int">).
So, the <function>buddy_fun()</function> remains suspended until the
user space task makes a real-time &lxrt; call; at that moment, the
&lxrt; scheduler wakes up the buddy with
<function>lxrt_resume()</function>, which executes the function that
it got through its arguments from the trap handler, and then goes to
sleep again.
</para>
<para>
&lxrt; has 16 <emphasis>expansion slots</emphasis>, that application
writers can use to plug-in their own functionality. That means, if you
have a set of functions that run in hard real-time, by the &lxrt;
extension you can make them available to user space tasks. You need to
do two things:
<itemizedlist>

<listitem>
<para>
<emphasis>In kernel space.</emphasis>
This is done by acquiring (at compile time of your functions) a
&ldquo;key&rdquo; from &lxrt;, and making an array of function
pointers. So, your functions can be
recognized by the &lxrt; infrastructure based on these two numbers:
(i) the &lxrt; key, and (ii) the index in the function array.
</para>
</listitem>

<listitem>
<para>
<emphasis>In user space.</emphasis> Make user space functions with the
same interface as the above-mentioned kernel space functions, by using
the <function>rtai_lxrt()</function> function that takes care of the
trap setup, discussed in a section above.
</para>
</listitem>

</itemizedlist>
The symmetry between the use of the new functionality in kernel space
and user space shows up in your source files: their typical
structure is as follows:
<programlisting>
<![CDATA[
#ifdef MODULE 
 ... 
#ifdef __KERNEL__
        // kernel space functions
#else
        // user space functions
#endif

#endif 
]]>
</programlisting>
The above-mentioned functions such as
<function>rt_task_init()</function> and
<function>rt_make_hard_real_time()</function> are examples of the
&ldquo;<emphasis>encoded trap<indexterm>
<primary>encoded trap</primary></indexterm>
<indexterm>
 <primary>trap</primary><secondary>encoded</secondary>
</indexterm></emphasis>&rdquo; technique
(<xref linkend="rtai-core-int">) that is behind all of &lxrt;.
The argument passing needed in an encoded trap 
is performed in the short but somewhat enigmatic function
<function>rtai_lxrt()</function> in
<filename>include/asm-xyz/rtai_lxrt_sup.h</filename>. The
&ldquo;magic&rdquo; is due to the argument encoding used
in this function.
</para>
<para>
So, the clue of the &lxrt; procedure is to make the user space task
launch a trap handler that executes a real-time service for the user
space task; and all this is done through just one single trap handler,
by encoding the desired service. Hence, a special &lxrt; version must
be made for <emphasis>all</emphasis> &rtai; functions that one wants
to be available to user space tasks, and a unique code must be given
to each function. The codes are given in
<filename>include/rtai_lxrt.h</filename>, which also contains the
&lxrt; versions of the core &rtai; calls; non-core functionality
(FIFOs, serial communication, etc.) needs extra header files, such as,
for example, <filename>include/rtai_fifos_lxrt.h</filename>.
The function <function>lxrt_handler()</function> in the file
<filename>lxrt/lxrt.c</filename> connects the encoded service requests
with real executable calls in kernel space.
The <function>lxrt_handler()</function> does not only work with
<emphasis>tasks</emphasis> shared between user space and kernel space,
but also with <emphasis>semaphores</emphasis> and
<emphasis>mailboxes</emphasis>, via precisely the same technique:
encoding what the desired action is, in the data given to the &lxrt;
trap.
</para>
<para>
<function>dispatch_trap()</function> in
<filename>arch/xyz/rtai.c</filename> does the trap handling. If it is
a trap for &rtai; it is passed to the trap handler that &rtai; has
registered; this is done in the
<function> init_module()</function> of
<filename>lxrt/lxrt.c</filename>.
<function>lxrt_trap_handler()</function> in
<filename>lxrt/lxrt.c</filename>: catches 7 (floating point error?) and
14 (memory allocation error), and then proceeds to the basic job:
</para>
<para>
The data structure for the coded trap message from user space to
kernel space is hardware dependent, e.g., for <acronym>i386</acronym>
it is defined in <filename>include/asm-i386/rtai_lxrt.h</filename>:
<programlisting>
<![CDATA[
union rtai_lxrt_t { RTIME rt; int i[2]; void *v[2]; };
]]>
</programlisting>
It is a <parameter>union</parameter> data structure, because its
contents can have various meanings. The same file also shows that for
this hardware platform, &lxrt; chooses the trap
<parameter>int $0xFC</parameter>; this trap number is defined as:
<programlisting>
<![CDATA[
#define RTAI_LXRT_VECTOR  0xFC
]]>
</programlisting>
but also occurs directly in the assembler code that defines the data
structure for the trap:
<programlisting>
<![CDATA[
static union rtai_lxrt_t _rtai_lxrt(int srq, void *arg)
{
  union rtai_lxrt_t retval;
  __asm__ __volatile__ ("int $0xFC"
  : "=A" (retval) : "a" (srq), "d" (arg));
  return retval;
}
]]>
</programlisting>
</para>
<para>
Incorporating a user space task into the real-time scheduling is
the second &lxrt; functionality. It is implemented by patching the
normal task switching code of &linux;.
<filename>include/asm-i386/rtai_lxrt.h</filename> defines the
hardware-dependent part. &lxrt; works with a flag that signals the
scheduler whether or not to take into account &lxrt; tasks; this flag
<programlisting>
<![CDATA[
volatile unsigned long lxrt_hrt_flags;
]]>
</programlisting>
keeps track of whether a task is running in hard real-time or not. The
flag is used, for example, in the
<filename>XYZscheduler/rtai_sched.c</filename> file. That scheduler
code also uses the 
<parameter>my_switch_to</parameter> function. Also
<filename>lxrt/lxrt.c</filename> uses that function, in
the <function>lxrt_schedule()</function>. That is a replacement for
the &linux; <function>schedule</function>.
<function>lxrt_schedule()</function> is used  in the
kernel thread scheduler (??) <function>kthread_b()</function>,
and in <function>give_back_to_linux()</function>.
</para>
<para>
Scheduling in &lxrt;: <function>lxrt_sigfun()</function> to
<function>lxrt_schedule()</function> when getting back
to &linux; from the &rtai; schedulers;
<function>steal_from_linux()</function> to make a &linux;
process a user space hard real-time module;
<function>give_back_to_linux()</function> to return a user space
module to the &linux; tasks.
</para>

<para>
(TODO: signals for &lxrt; tasks.)
</para>

</sect2>

</sect1>


<sect1 id="rtai-lxrt-extending">
<title>Making your own extensions to LX/RT</title>
<para>
(TODO: &lxrt;/&comedi; as an example of extending &lxrt;.)
</para>
</sect1>


<sect1 id="rtai-modules-impl">
<title>Module implementations</title>
<para>
This section explains the code in the
<function>init_module()</function> functions in the various &rtai;
parts.
</para>
<para>
<function> init_module()</function> of
<filename>lxrt/lxrt.c</filename> does the following: &hellip;
</para>

</sect1>

</chapter>



<chapter id="cpp">
<title>&cpp; and real-time</title>

<para>
Operating systems are most often written completely in one single
language, and most often that language is &ccc;. There are (and will
be) always small, hardware-dependent parts that use assembly language,
for efficiency or feasibility reasons. But, at the other end of the
language spectrum, also object-oriented languages are being used;
sometimes in combination with &ccc; as the basic, low-level language,
<citation>Walmsley2000</citation>.
</para>

<sect1 id="c-cpp">
<title>&ccc; and &cpp;</title>
<para>
<indexterm><primary>&cpp;</primary></indexterm>
Most operating systems are programmed in &ccc;: all commercial &unix;
systems, &linux;, &nt;, &qnx;, etc. Writing operating systems in a
hardware-independent way was exactly the reason why Bell Labs created
the &ccc; language. It is not much more than an embellished assembly
programming language, but it has become a de facto standard because of
the success of the &unix; operating system. The
<emphasis>pointer</emphasis> concept of &ccc; is one of its major
advantages for writing device drivers and operating systems: it allows
the programmer to place a variable of a program onto a specific
hardware address; or to work with (doubly) linked lists of data
structures, which is a very common need in the bookkeeping tasks of
the OS; etc. Efficiency is another advantage of &ccc;: it doesn't have a
&ldquo;runtime&rdquo; (such as &cpp; or &java;) in which non-deterministic
operations take place behind the screens (e.g., dynamic allocation of
memory; garbage collection), and beyond the control of the programmer.
</para>

<para>
&ccc; does have a number of disadvantages too, of course. Modern
programmers have learned to appreciate object-oriented programming,
with its emphasis on keeping related data and functionality hidden
inside of classes with well-defined interfaces. Although a programmer
<emphasis>can</emphasis> practice the ideas of object-oriented
programming in &ccc;, the language itself doesn't support it. And a large
part of the &ccc; source files of free software projects prove that
writing &ldquo;spaghetti&rdquo; programs is way too easy in &ccc;&hellip;
</para>

<para>
So, &ada;<indexterm><primary>&ada;</primary></indexterm>
(in the 1980s already) and &cpp; (from the late 1990s) have appeared
on the radar screen of operating system programmers. Not
&java;,<indexterm><primary>&java;</primary></indexterm> or
Eiffel,<indexterm><primary>Eiffel</primary></indexterm>
or other object-oriented languages, because
&ada; and &cpp; allow one to keep most of the &ccc; advantages (pointers,
efficiency) needed in operating systems. &ada; became in vogue because
the US army wanted a reliable and &ldquo;safe&rdquo; programming
language for all its real-time and embedded software. &ada; is still
mandatory for most aerospace systems (military as well as civilian).
&rtems; (<xref linkend="sect-rtems">) is a free
software &rtos; that came into being in this context.
</para>

<para>
An interesting evolution in portable and high-quality &cpp; code,
that can (sometimes) be used in real-time systems, is the
<ulink url="http://www.boost.org">Boost project.</ulink><indexterm>
<primary>Boost</primary></indexterm>
The project offers 
<ulink
 url="http://www.boost.org/libs/thread/doc/rationale.html">interesting
motivations
</ulink>
for their work on threads, locks, etc.
</para>

<para>
The three primary aspects of object oriented programming are
<orderedlist>

<listitem>
<para>
<emphasis>Encapsulation.</emphasis> The idea to encapsulate the
implementation of a class is based on various motivations:
 <itemizedlist>

 <listitem>
 <para>
To distinguish between the <emphasis>interface</emphasis> (or,
&ldquo;<emphasis>specification</emphasis>&rdquo;) of the class, and the 
<emphasis>implementation</emphasis> of the operations that can be
called on the class.
 </para>
 </listitem>

 <listitem>
 <para>
The need for <emphasis>modularity<indexterm><primary>modularity</primary>
</indexterm></emphasis>,
in order to structure complex
applications designed and implemented by a team of programmers.
 </para>
 </listitem>

 <listitem>
 <para>
It offers a structure for protection and authorization.
 </para>
 </listitem>

</itemizedlist>
</para>
</listitem>

<listitem>
<para>
<emphasis>Inheritance.</emphasis> The idea that one class can inherit
properties from another class provides for a
<emphasis>natural classification</emphasis>
of classes, with a minimum of specifications.
Natural means that the software specifications of a class correspond
closely to the properties that we know from real-world objects
and/or concepts.
The inheritance relationship between classes makes one class the
<emphasis>parent</emphasis> (or &ldquo;base&rdquo;,
&ldquo;superclass&rdquo;, &ldquo;ancestor&rdquo;, etc.) of another
class. Inheritance can be used as an <emphasis>is-a-kind-of</emphasis>
(or <emphasis>is-a</emphasis>) relationship.  Inheritance comes in two
flavours:
<emphasis>interface inheritance<indexterm>
<primary>interface inheritance</primary></indexterm></emphasis>,
<indexterm>
 <primary>inheritance</primary><secondary>interface</secondary>
</indexterm>
and <emphasis>implementation inheritance<indexterm>
<primary>implementation inheritance</primary></indexterm></emphasis>.
<indexterm>
 <primary>inheritance</primary><secondary>implementation</secondary>
</indexterm>
</para>
</listitem>

<listitem>
<para>
<emphasis>Polymorphism.</emphasis> This is the idea that the same
software object can behave in different ways, depending on various
factors.
</para>
</listitem>

</orderedlist>
Not all object-oriented languages offer the full set of these
concepts, or implement them in the same way. For example, &cpp; lacks
direct support of the concept of an <emphasis>interface</emphasis>,
such that interface inheritance is always implemented by
&ldquo;workarounds&rdquo;. (TODO: how?)
</para>
<para>
None of the above-mentioned object-oriented aspects are supported in
&ccc;, which leads to the following &ldquo;problems&rdquo;:
<itemizedlist>

<listitem>
<para>
<emphasis>Lack of encapsulation.</emphasis> 
If the programming language doesn't impose or stimulate encapsulation,
any effort at trying to separate specification from implementation
that the original coder may have had, tends to be compromised very
quickly, not only by other contributors, but also by the original
coders themselves.
</para>
</listitem>

<listitem>
<para>
<emphasis>Lack of inheritance.</emphasis> 
Because &ccc; programmers have never been taught and drilled to watch
for commonalities between different software components, most of them
apply code re-use by &ldquo;copy-and-paste&rdquo; of source code lines
between components. But once a common piece of code appears in two
different places, these two pieces begin to have their own evolution,
and the bonus of having a common ancestor disappears, and the code becomes
larger. Typically, newcomers to the project don't know about the
commonality insights their predecessors had, and have much more
problems understanding the code, and will, hence, be less efficient
and more error-prone in their contributions.
</para>
</listitem>

<listitem>
<para>
<emphasis>Lack of polymorphism.</emphasis> 
This has led to the introduction of &ldquo;states&rdquo;, with Boolean
operators as the simplest form of state: the &ldquo;class&rdquo;
reacts differently to the same inputs when it is in a different state;
or, alternatively, it accepts different inputs according to its state.
This in itself is not the real issue, but in combination with the lack
of interfaces and encapsulation, the internal states of objects are
used by other software components, that begin to adapt their
interactions with the object based on the knowledge of its state. This
is a typical situation of <emphasis>high coupling</emphasis> between
software components; <xref linkend="chap-design"> explains the
pitfalls of this situation.
</para>
</listitem>

</itemizedlist>
</para>

<para>
Of course, using the above-mentioned OO aspects is not 
<emphasis>in itself</emphasis> a sufficient condition for writing good
quality software!
</para>

</sect1>


<sect1 id="cpp-linux-rtos">
<title>&cpp; in the Linux RTOSs</title>
<para>
&cpp; has kept all aspects of &ccc; that are useful in the
implementation of software that works in close interaction with the
hardware: pointers to hardware addresses being a major feature.
However, &cpp; also has two parts whose execution is non-deterministic:
(i) dynamic object creation and deletion, and
(ii) the Run-Time Type Identification (RTTI) feature.
The good news is that both parts are reasonably easy to avoid, because
there is even compile-time support from the compilers to disable these
non-deterministic parts. The bad news is that most of this
functionality is used deeply behind the screens of object creation and
deletion, and exception handling; and few programmers have been
trained in spotting these points in their &cpp; code.
</para>
<para>
Both &rtlinux; and &rtai; have growing support for &cpp; in
real-time, but the majority of their programmers and code are not
&ldquo;&cpp;-ready&rdquo;. &ecos;, on the other hand, has been
written completely in &cpp; from scratch, and hence all of its
contributors must master the &cpp; basics.
</para>
<para>
&rtai; (<xref linkend="rtai-specific">) allows the use of &cpp; in kernel
space. But this does <emphasis>not</emphasis> mean that one can use
&linux; kernel functions from &cpp;. That will
most likely cause problems when trying to include &linux; kernel
header files into &rtai; &cpp; files, and similarly with &rtai; header
files. To get the functionality that is needed for the
<function>rtai_cpp</function> classes, some wrapper file was
&ldquo;hacked&rdquo;, to deal with just a very few
problem headers. This file does not give full &linux; kernel
functionality to &cpp; programs, though.
So, in order to use a function from &linux;, one needs to wrap it in
an <function>extern "C"</function> function.
</para>

</sect1>

</chapter>


<chapter id="cross-compilation">
<title>Cross compilation, debugging and tracing</title>

<para> 
This Chapter explains the basic principles behind developing code for
another platform than the development platform, loading code to that
platform, making it boot autonomously, and debugging it. Tracing of
the execution of a running embedded or real-time system is another
important tool to assess the behaviour of an application in its whole.
</para>

<sect1 id="cross">
<title>Cross development</title>
<para>
(TODO: How? What hardware support needed? )
</para>

</sect1>


<sect1 id="debug">
<title>Debugging</title>
<para>
(TODO: host + target, remote debugging, BDM, S-records for serial
communication,
<function>rt_printk()</function> (explain where its output ends up).)
</para>

</sect1>


<sect1 id="ltt">
<title>Linux Trace Toolkit</title>
<para>
(Excerpt from the documentation of &ltt;.)
The &linux; operating system is a multiprogramming, multiuser system.
This means that it is able to handle multiple running programs at
once. On a uniprocessor system (a computer with only one
microprocessor), this is achieved by sharing this precious resource
among multiple tasks, letting each execute for a certain period of
time and then switching to another. The selection and switching of
processes is handled by the &linux; kernel, which also is a program
and therefore also needs some time from the processor. It is also
responsible for fulfilling certain requests by the programs it
manages, dealing with error conditions, etc.  One could have the need
to know exactly what these scheduling decisions, process switches and
various management tasks are, how they are handled, how long they take
and to which process the processor is allocated. Spread out over a
certain period of time, we call this an execution trace.
</para>
<para>
The 
<ulink
  url="http://www.opersys.com/LTT/">Linux Trace Toolkit
</ulink>,<indexterm>
<primary>Linux Trace Toolkit</primary></indexterm>
<indexterm>
 <primary>tracing</primary><secondary>Linux Trace Toolkit</secondary>
</indexterm>
is a suite of tools designed to do just that: extract program
execution details from the &linux; or &rtai; operating systems and
interpret them. Specifically, it enables its user to extract processor
utilization and allocation information for a certain period of time.
It is then possible to perform various calculations on this data and
dump this in a text file. The list of probed events can also be
included in this. The integrated environment can also plot these
results and perform specific searches.
</para>
<para>
&ltt; works by inserting tracing commands into the source code.
This requires a &ldquo;patch&rdquo;, and the extra instructions slow
down the execution a little bit.
</para>
<para>
(TODO: more details.)
</para>

</sect1>

</chapter>

</part>

<!-- =====================P=A=R=T==III============================= -->

<part id="part3">
<title>Design</title>

<partintro>
<para> 
The first Parts of this text dealt with
<emphasis>functionality</emphasis>; this Part is about
<emphasis>structure</emphasis>.
</para>
<para>
The previous Chapters described real-time operating system
concepts, their
<emphasis>general-purpose functionalities</emphasis>, and some of
their implementation aspects. The following Chapters give information,
examples and hints about how to
<emphasis>design<indexterm><primary>design</primary>
</indexterm></emphasis>
applications on top of this raw &rtos; functionality.
Indeed, design is not about adding as many &rtos; features in your
application as you can, or about using the first &api; function you find
that could be used to solve your current implementation problem.  No,
design is all about making the <emphasis>logical structure</emphasis>
of your particular application as explicit and clear as possible, and
on searching hard to reduce the number of &rtos; features needed to
implement that application logic. So, design is, by definition,
always driven by <emphasis>application-specific</emphasis> criteria,
and hence no &ldquo;general purpose&rdquo; real-time system design
exists. Nevertheless, there <emphasis>are</emphasis> lots of generic
design issues, that have received neat solutions that all application
programmers should be familiar with. Examples treated in later
Chapters of this Part are: loose coupling, components, application
architectures, software patterns, and frameworks.
</para>
<para>
The observation from which to start a discussion on design, is that
(mature) application domains have, over the years, developed a
relatively fixed <emphasis>structure</emphasis> of cooperating tasks.
But, typically, the <emphasis>functionality</emphasis> of some of the
tasks changes more quickly: features are added,
alternative implementations of functionality are tried, new hardware
or communication protocols are supported, etc. Examples of such
mature domains are: telecom, motion control of machines, setpoint
control of processes, networking, or data bases. Except for the last
domain, real-time and embedded aspects are very important. Hence, it's
the goal of this Part to describe <emphasis>good designs</emphasis>
for the basic <emphasis>structure</emphasis> code in these application
domains. The domain of <emphasis>general feedback control and signal
processing</emphasis> is taken as an example. (And similarly worked-out
examples for other domains are very welcome!) The good news is that most
application domains have a lot of very similar basic needs. The
Software Engineering community is working hard to capture these in
so-called <emphasis>Software Patterns</emphasis>; the ones relevant to
real-time and embedded systems are presented later in this Part.
</para>
<para>
Most of the material addressed in this Part is not unique to
real-time or embedded systems. But real-time and embedded systems tend
to be more affected by the &ldquo;holy grail of efficiency&rdquo;,
which is one of the major causes of poorly structured and hence
poorly maintainable code: programmers make &ldquo;shortcuts&rdquo;
that mix <emphasis>functional aspects</emphasis> of the
<emphasis>application</emphasis> with <emphasis>structural</emphasis>
aspects of the application's <emphasis>architectural
design</emphasis>. And mixing function and structure makes
implementations much more messy.  We <emphasis>do</emphasis> pay
attention to efficiency, however. And the focus remains on real-time
systems, which means that 
<emphasis>scalability</emphasis> comes second: no-one expects a
real-time application to be scalable to the same extent as other IT
applications, such as web or database serving. But a well-designed
project has a <emphasis>structure</emphasis> in which it is clear
which parts are scalable and which are not.
</para>

</partintro>


<chapter id="chap-design">
<title>Design principles</title>

<para>
<indexterm><primary>design</primary></indexterm>
The message of this Chapter is: a well-designed software project has a
clear and documented
<emphasis>decoupling between structure and functionality</emphasis>.
<xref linkend="sect-func-struct"> explains what
&ldquo;structure&rdquo; and &ldquo;functionality&rdquo; mean, and
<xref linkend="sect-coupling"> says how to separate them. Components
(<xref linkend="sect-components">) are the modern software engineering
approach to build software projects that use cleanly separated
structure and functionality. The architecture of the system
(<xref linkend="sect-architecture">) defines how the available
components are to be connected together, and explains why that
particular choice should be made.
</para>
<para>
This Chapter only talks about the <emphasis>theory</emphasis> of good
software design; the following Chapters illustrate this theory with
applications that are relevant to the scope of this document.
</para>


<sect1 id="sect-func-struct">
<title>Structure and functionality</title>

<para>
<emphasis>Functionality<indexterm>
<primary>functionality</primary></indexterm></emphasis> is the set of
all algorithms needed to perform the purpose of the application;
<emphasis>structure<indexterm><primary>structure</primary>
</indexterm></emphasis> is the way in which the algorithms are
distributed over tasks, which tasks have to exchange data, and how
they have to be synchronized.
In large software projects, the division between
structure and functionality is important, because few of the
contributors will be able to grasp the whole software project and
predict the consequences of the code they add to the project.
Therefore, the &ldquo;senior&rdquo; project people should define and
code the project's infrastructure, in which all contributors can add
functionality while having to consider only a limited part of the
project. This idea is already wide-spread, because large projects
such as Mozilla, Gnome, the &linux; kernel, Windows 2000, etc., all
work (more or less) along these lines.  However, a lot of the
real-time developments (outside of specialized companies) are done in
small, isolated groups, where often the majority of developers are not
computer scientists but specialists in the application domain; and new
students in the field are concentrating more on understanding the
real-time primitives than on learning to design software applications.
This often leads to &ldquo;spaghetti code&rdquo;, and abuse of the
available real-time and &ipc; primitives.
</para>
<para>
These are some examples of large scale applications, with major
real-time and embedded needs:
<itemizedlist>

<listitem>
<para>
<emphasis>Communication networks</emphasis>: examples of structure in
this domain are the <emphasis>OSI</emphasis> 7-layer model, the
principle of <emphasis>name servers</emphasis>, the
<emphasis>CORBA<indexterm><primary>&corba;</primary></indexterm>
</emphasis> specifications, etc. They all decouple the
infrastructure of sending packages with data from the meaning of these
packages in the context of an application. Telecom is one of the best
examples of time-proven designs (and of the importance of 
<emphasis>open standards</emphasis> to make optimal use of these
designs): everybody takes it for granted that telephone systems do not
crash, but few people realise the magnitude of software components
involved in the process.
</para>
</listitem>

<listitem>
<para>
<emphasis>Control systems<indexterm>
<primary>control systems</primary></indexterm></emphasis>. Many
devices and even whole plants are controlled by computers. These
systems have a wide variety of <emphasis>functionalities</emphasis>:
oil refinery, milling tools, medical apparatus, laser
&ldquo;guns&rdquo; in discotheques, laser printers, etc. But they all
have the same basic components, well-known and thoroughly studied in
engineering sciences such as systems and control theory. The
generic <emphasis>structure</emphasis> of all these applications is
that of <emphasis>feedback control</emphasis>: one component generates
a signal that represents the desired value(s) of one or more of the
physical signals in the system; one component measures these
physical signals; another component derives values of other signals
that are not directly measurable; one component steers the inputs to
&ldquo;actuators&rdquo; that can change the value of (some of) the
relevant physical signals. <xref linkend="chap-control"> gives a more
detailed description.
</para>
</listitem>

</itemizedlist>
The control application is used in the following Chapters to
illustrate the theory of good design with practical examples.
</para>

</sect1>


<sect1 id="sect-coupling">
<title>Loose coupling</title>
<para>
Only the simplest of applications can be programmed as one single
task, with nothing else but straightforward communication with
peripheral hardware. Most other projects need multiple components, and
hence synchronization and data exchange between them. Many developers
make the mistake of putting too much <emphasis>coupling</emphasis>
between their components. That means that, in the implementation of
one software component, they use knowledge about the implementation
internals of other software components, or about the specific
architecture (and operating system!) in which the components are used.
Some typical examples are:
<itemizedlist>

<listitem>
<para>
A task in one component suspends and restarts a task in another
component. This implicitly means that the first task knows (or rather,
pretends to know!) when it is appropriate and safe to influence the
activities of the other task.
</para>
</listitem>

<listitem>
<para>
Component A uses a <emphasis>finite state machine</emphasis> to
structure its internal working; component B bases its interaction with
component A on deciding which state it wants A to go to. This means
that the implementation of A cannot be changed without changing B
also.
</para>
</listitem>

<listitem>
<para>
Task X delays itself during 100 milliseconds, in order to allow
task Y to be scheduled and get started. This means that the
&ldquo;proper&rdquo; synchronization of X and Y depends on platform
and configuration dependent timing; and this timing tends to change
drastically when functionality is added, or hardware is updated.
</para>
</listitem>

<listitem>
<para>
There is only one interrupt service routine in the application, and
the programmer doesn't make the effort of splitting its implementation
into a <emphasis>real</emphasis> &isr; (that does nothing but the
really essential processing) and a &dsr; (that takes care of the
further processing of the interrupt, <xref linkend="sect-idsr">).
Again, this is a software structure that is error prone when updating
the system or migrating it to other hardware.
</para>
</listitem>

<listitem>
<para>
Application programmer S raises the priority of a task, because it
doesn't run &ldquo;fast enough&rdquo; within the current situation.
She also switches to <emphasis>priority inheritance</emphasis>
(<xref linkend="sect-prior-inv">) for some of the system's critical
sections. Her colleague H adds his part of the system, and also feels
the need to raise the priorities of &ldquo;his&rdquo; tasks and
critical sections. This phenomenon reflects an implicit use of
knowledge about the operating system, and, in practice, often leads to
a race that eventually ends in most tasks running at the highest
priorities, which, of course, reduces the usefulness of priority-based
scheduling.
</para>
</listitem>

</itemizedlist>
(TODO: make this list of bad examples as exhaustive as possible.)
</para>
<para>
The solution to these coupling problems is, of course, quite simple:
<emphasis>avoid every form of coupling</emphasis>. Or rather, strive
for <emphasis>loose coupling</emphasis>,<indexterm>
<primary>loose coupling</primary></indexterm>
<indexterm>
 <primary>coupling</primary><secondary>loose</secondary>
</indexterm>
because complete decoupling is only possible for tasks that have
nothing to do with each other. This loose coupling advice, however,
is difficult to translate into concrete guidelines. It's one of these
things that make good (software) engineering stand out from the rest,
and makes program design into an &ldquo;art&rdquo;. Understanding what
exactly causes the problems in the list of examples given above, is
already a good beginning; as is understanding the relevant software
patterns in <xref linkend="chap-patterns">. Looking at your
application in an object-oriented way also helps a lot: the
fundamental reason behind the successful use of objects is exactly
their ability to clearly <emphasis>decouple</emphasis> structure
(&ldquo;class hierarchies&rdquo;) and functionality
(&ldquo;object methods and data&rdquo;). And designing a software
system in an object-oriented way is independent of the language(s)
used in the implementation of the system. A good design can also be
implemented in &ccc; (if the programmers are disciplined enough).
Anyway, there is more to a software system than describing which
objects it should use&hellip;
</para>

</sect1>


<sect1 id="sect-mediator">
<title>Mediator</title>
<para>
A <emphasis>mediator<indexterm>
<primary>mediator</primary></indexterm></emphasis>
is a major design principle to introduce
<emphasis>loose coupling</emphasis>
(<xref linkend="sect-coupling">) into the interaction between two or
more components.
&ldquo;Mediator&rdquo; means &ldquo;object in the middle&rdquo;
between two interacting objects.
And &ldquo;interaction&rdquo; means, at least, both
<emphasis>synchronization</emphasis> and
<emphasis>data exchange</emphasis>, i.e., &ipc;.
The mediator takes care of the decoupling in multiple ways:
<itemizedlist>

<listitem>
<para>
<emphasis>Service name serving</emphasis>: the components need not
know each other's identity, but just have to know the name of the
mediator they have to interact with, in order to get the service they
are looking for in the interaction.
</para>
</listitem>

<listitem>
<para>
<emphasis>Data handling</emphasis>: a general interaction involves the
exchange of data between the interacting components. This data resides
in the object or component that implements the mediator, and the
interacting components access the data through methods in the
mediator's interface. Only in this way, the mediator can guarantee
data consistency. Hence, the implementation of the mediator software
pattern will itself be based on some sort of
<emphasis>monitor</emphasis>
(or &ldquo;<emphasis>protected object</emphasis>&rdquo;) software
pattern (<xref linkend="sect-monitor">).
</para>
</listitem>

<listitem>
<para>
<emphasis>Synchronization</emphasis>. Instead of distributing over all
the interacting components the information about how to synchronize
their mutual interaction, it's only the mediator that has to know
about the synchronization needs in the whole system of interacting
components. And it's only with the mediator that each component
interacts. This means that a &ldquo;graph-like&rdquo; interaction
structure is replaced by a much simpler &ldquo;star-like&rdquo;
interaction, with the mediator in the centre.
</para>
</listitem>

</itemizedlist>
In summary, the mediator pattern is one particular
<emphasis>policy</emphasis> to use the <emphasis>mechanism</emphasis>
offered by the monitor pattern. In addition, 
the monitor pattern makes use of several
(purely) <emphasis>synchronization</emphasis> mediators. This is not a
contradiction, or a case of circular reasoning: mediator and monitor
patterns exist in different levels of complexity.  It's the more
complex form of the one that makes use of the simpler form(s) of the
other.
</para>
<para>
The following Chapters introduce various examples of mediators. With a
small stretch of the imagination, all
synchronization (<xref linkend="ipc-synch">) and
data exchange (<xref linkend="ipc-dataexchange">) &ipc; primitives can
be considered to be mediators; however, in their implementation this
insight has rarely been taken into account.
</para>

</sect1>


<sect1 id="sect-components">
<title>Components</title>
<para>
A <emphasis>component<indexterm>
<primary>component</primary></indexterm></emphasis> is a unit of
software with a clearly specified
<emphasis>interface<indexterm>
<primary>interface</primary></indexterm></emphasis>, that describes
its functionality. This functional interface makes a software component into an
independently reusable piece of code, i.e., something that can be used
as a &ldquo;service&rdquo;. It's this independent server property that
distinguishes a component from an object class. Here is a list of
things that a component <emphasis>can</emphasis> (but need not) do:
run several tasks and &ipc; channels internally; 
interrogate other components to find and use their interfaces; be
interrogated itself; and generate and handle events. Hence, a
component is deliverable by third parties on the basis of its
interface specification only. In principle, it can even be delivered
as a binary. A component should not only document the interface it
offers to the world, but also the interfaces it requires from other
components in order to do its job.
</para>

<para>
The difference between (or rather, the complementarity of)
the concept of <emphasis>components</emphasis> and the concept
of <emphasis>object-oriented programming</emphasis> is that
object-oriented programming talks only about
<emphasis>static</emphasis> structure, and not about concurrency and
system semantics. Even the most advanced compilers for object-oriented
languages cannot impose that, for example, the
<function>initialize()</function> method of an object must be called
<emphasis>before</emphasis> its <function>run()</function> method.
Synchronization constraints in a system also don't enter the scope of
object-oriented programming.  For example,
<function>method x()</function> must be executed every millisecond,
but only when <function>method y()</function> has run successfully.
</para>

<para>
Different implementations of the same interface can focus on different
optimizations: efficiency of execution or of memory usage, proven
absence of deadlocks, etc. In a typical application, components of
different types are needed, and must be integrated.  A component-based
design facilitates the distribution of the implementation work over
several contributors, it improves portability, and facilitates the
integration with, and re-use by, other projects.
</para>

<para>
So much for the theory&hellip; This theory seems to work quite well in
the context of &ldquo;normal&rdquo; applications (business
transactions processing, web services, etc.), but in the context
of real-time and embedded systems, the classical component interface
lacks three important specifications:
<itemizedlist>

<listitem>
<para>
<emphasis>Timing and synchronization needs.</emphasis> It's next to
impossible to guarantee in a component interface what the exact
behaviour of the component will be if it is integrated into other
people's real-time application.
</para>
</listitem>

<listitem>
<para>
<emphasis>Memory needs.</emphasis> The memory footprint of the
component binary in itself is not sufficient information for its safe
integration into an embedded system: the component could use much more 
heap and stack space during its operation than expected.
</para>
</listitem>

<listitem>
<para>
<emphasis>Composition of components.</emphasis>
&ldquo;Glueing&rdquo; two components together doesn't, in general,
result in a new component. That is, the composite
&ldquo;component&rdquo; doesn't offer a new interface consisting of an
ordered sequence of completely specified interactions and properties.
It is for this reason that the development of concurrent programs is
an error-prone process. Especially in real-time systems.
</para>
</listitem>

</itemizedlist>
The conclusion is <emphasis>not</emphasis> that the idea of components
should be abandoned for real-time and embedded systems; only that the
use of binary delivered third-party components is cumbersome. The
advantage of specifying clear interfaces of parts of a system remains
a useful design guideline, also for real-time and embedded application
programmers, and even within a project that has no externally
delivered components. A real-time system should have a good description
of which components it offers (i.e., describing their
<emphasis>functionality</emphasis> in a documented interface), and how
their <emphasis>internal</emphasis> subcomponents are interconnected
(i.e., the <emphasis>structure</emphasis> of the system).
This mostly boils down to deciding which tasks to use, and what their
synchronization and data exchange needs are.
</para>
<para>
The pessimistic view is that this specification of the internal
structure violates the previously introduced guideline for loose
coupling, and that this violation is often unavoidable. The
optimistic view is that this internal structure is a
<emphasis>natural part</emphasis> of a good software component: as
said before, and as explained in more detail in
<xref linkend="chap-patterns">, a mature software solution for a
particular problem has a natural structure which has proven to be the
best. Hence, revealing this structure should not be seen as a negative
compromise.
</para>

</sect1>


<sect1 id="sect-architecture">
<title>Architecture</title>
<para>
The architecture<indexterm><primary>architecture</primary></indexterm>
of a software system is all about choosing its
<emphasis>structure</emphasis>: it is a <emphasis>specific
connection</emphasis> between the software components in the system.
That is, an architecture makes a specific choice of how data and
signals must travel between components. And, in general, other
structures may be possible.
</para>

<para>
The best-known example of a system architecture is probably the
<emphasis>hierarchical architecture</emphasis>: data and signals flow
from layer to
layer in the hierarchy. This architecture has the advantage of being
very transparent and hence understandable, but the disadvantage of
being inflexible. Many projects start this way, but then try to
&ldquo;patch&rdquo; the architecture later on, when trying to work
around the inflexibility. An example of an often encountered
inflexibility is that the highest level in the hierarchy is, strictly
speaking, not allowed to investigate the status of the components in
the lowest layer directly, and must pass through all intermediate
levels.
</para>

<para>
So, in the design phase one should try to postpone decisions about the
architecture as long as possible, and to design the components in such
a way that their functioning does not depend on a particular choice of
architecture. It is indeed almost certain that the requirements of the
software project will change during its lifetime, and the architecture
is often the aspect of a software system that is most difficult to adapt.
Especially so when the original developers did not provide an explicit
description and motivation for the system's architecture.
</para>

</sect1>

</chapter>


<chapter id="chap-patterns">
<title>Patterns and Frameworks</title>

<para>
This Chapter defines what <emphasis>software patterns</emphasis> and
<emphasis>software frameworks</emphasis> are. It describes some common
patterns that are relevant to real-time systems: monitors, events and
state machines, and mediators such as &ldquo;producer-consumer&rdquo;
and &ldquo;execution engine&rdquo;. The motion control
<emphasis>framework</emphasis> is developed in more detail in
<xref linkend="chap-control">, with the presented 
software patterns as &ldquo;medium-level&rdquo; components, and the
&rtos; primitives of <xref linkend="part1"> as &ldquo;low-level&rdquo;
programming primitives.
</para>


<sect1 id="sect-patterns-defs">
<title>Definitions</title>

<para>
A <emphasis>Software Pattern<indexterm>
<primary>software pattern</primary></indexterm></emphasis>
(<citation>gof94</citation>, <citation>posa96</citation>)
is a <emphasis>proven, non-obvious, and constructive solution to a common
problem in a well-defined context</emphasis>. This solution is the
result of many years of experience dealing with the often delicate
interactions and trade-offs (&ldquo;forces&rdquo;) that each tend to
drive the solution into different directions. A Pattern describes the
interaction between a group of components, hence it is a higher-level
abstraction than classes or objects. It's also
<emphasis>not</emphasis> an implementation, but a textual description
of a solution and its context.
</para>
<para>
<xref linkend="chap-design"> introduced already an important software
pattern: the <emphasis>mediator</emphasis>. It takes care of the
<emphasis>decoupling</emphasis> of the interaction between two or more
components.
</para>

<para> 
A <emphasis>framework<indexterm>
<primary>framework</primary></indexterm></emphasis>
(<citation>Johnson97</citation>) is <emphasis>a set of computer code
files that implement a reusable software solution for a complete but
particular problem domain.</emphasis> Important words in this
definition are &ldquo;implement&rdquo; and &ldquo;particular&rdquo;: a
framework is <emphasis>code</emphasis>, which only needs some so-called
&ldquo;<emphasis>hot spots<indexterm>
<primary>hot spot</primary></indexterm></emphasis>&rdquo; to be filled
in before it works. These hot spots are system-dependent parts of the
software, such as: particular device drivers or user interface code.
</para>
<para>
A framework is much broader
(&ldquo;programming in the large&rdquo;) than a software pattern
(&ldquo;programming in the small&rdquo;). A framework typically
contains several patterns, but a pattern doesn't contain frameworks; a
framework contains code, a pattern doesn't. Frameworks are
constructed in such a way that similar applications within the same
domain can benefit from the same structures and abstractions, but may
need re-implementation of the &ldquo;hot spots&rdquo; in the
framework.
</para> 

</sect1>


<sect1 id="sect-monitor">
<title>Monitor<indexterm><primary>monitor</primary></indexterm></title>
<para>
The <emphasis>monitor</emphasis> is one of the older software
patterns, developed for more complex mutual exclusion jobs than what
the &rtos; primitives can deliver. Getting it working in a
real-time application in a time-deterministic way, however, is not
straightforward, and its scalability is much worse than linear in the
number of tasks, resources, and access conditions to be synchronized.
</para>
<para>
Semaphores (as well as all other locking mechanisms discussed in the
previous Chapters) are
<emphasis>primitive (&ldquo;low-level&rdquo;) tools</emphasis>:
programmers have to get the logic of using
<function>sem_signal()</function> and <function>sem_wait()</function>
(<xref linkend="sect-semaphore">)
calls correct. One single mistake most often means an incorrectly
functioning system. But, more importantly, using the locking
primitives in this way also violates the
<emphasis>loose coupling</emphasis><indexterm>
<primary>loose coupling</primary></indexterm> principle of good
software design (<xref linkend="chap-design">): the synchronization
is achieved by spreading lock function calls over the different tasks
that take part in a mutual exclusion or any other synchronization.
This distribution of the lock function calls makes later maintenance
or updating more difficult and error-prone, because the maintainers
should not forget to update <emphasis>all</emphasis> files in which
the locks are used.
</para>

<para>
The solution that lies at the basis of the monitor pattern, is to
avoid this spreading of locks by 
<emphasis>keeping them all at one place</emphasis>, protected
in the internals of one single so-called &ldquo;monitor&rdquo;
(or &ldquo;<emphasis>protected object<indexterm>
<primary>protected object</primary></indexterm></emphasis>&rdquo;) that
delivers the <emphasis>serialization<indexterm>
<primary>serialization</primary></indexterm></emphasis>
service to client tasks. And, moreover, the monitor does this
serialization in a quite specific fashion: it makes sure that, at any
given time, <emphasis>only one single client task</emphasis> can
execute any of the <emphasis>set</emphasis> of function calls that the
monitor &ldquo;protects.&rdquo; Or, in more modern object-oriented
terminology: if one task calls a member function of
the monitor, then it cannot be interrupted by another task that
wants to call a member function of the same monitor.
This aspect of mutual exclusion at the function calling level is the
really new <emphasis>synchronization primitive</emphasis> that the
monitor brings, in comparison with the classical &ipc; primitives
found in traditional operating systems. (They provide mutual exclusion
at the statement execution level.)
</para>

<para>
The monitor concept is another example of the
<emphasis>mediator<indexterm>
<primary>mediator</primary></indexterm></emphasis> idea
(<xref linkend="sect-mediator">):
<itemizedlist>

<listitem>
<para>
It mediates an intricate <emphasis>synchronization</emphasis> between
several tasks.
</para>
</listitem>

<listitem>
<para>
None of these tasks has to know the name or anything else about the
other tasks involved in the synchronization.
</para>
</listitem>

<listitem>
<para>
Part of the monitor could be the <emphasis>protection of shared
data</emphasis> between the interacting tasks.
</para>
</listitem>

</itemizedlist>
</para>

<para>
Obviously, a monitor needs locks internally to do the synchronization
and the bookkeeping of the client tasks that request its services. But
the advantage of the monitor is that it can keep all this bookkeeping
inside its own code and that much of this bookkeeping must only 
be programmed once. So, the overall application code is easier to
understand, predict, debug and maintain. The price paid for the
convenience of a monitor is that: (i) it requires more complex (and
hence slower) code; (ii) the exclusive execution within the monitor
can lead to substantial delays for the client tasks that have to wait
outside the monitor; (iii) using a monitor inside another monitor
implies <emphasis>nested<indexterm>
<primary>nested critical section</primary></indexterm>
<indexterm>
 <primary>critical section</primary><secondary>nested</secondary>
</indexterm></emphasis> critical sections, and this leads to a
deadlock sooner or later; and (iv) a monitor is difficult to
distribute over a network, because it needs shared memory for the
semaphores that it uses behind the screens.
</para>

<para>
The general monitor concept is not available as system call in any
operating system, because, as will become clear later, the syntax
of an OS function call is not sufficient to describe the full semantics
of a general monitor. There are a number of runtime support libraries
(such as the Mozilla NSPR library) and programming languages (such as
&ada; and &java;) that offer monitors. But also in these cases, only a
limited version of the monitor idea can be offered as a language or
runtime primitive. Ideally, one would like to use a programming
language syntax as shown in the pseudo-code skeleton of a monitor
below:
<programlisting>
<![CDATA[
monitor {  // begin of protected scope
mutex          monitor_mutex;// monitor-wide mutex to allow only one task
                             // to access the procedures below

data           shared_data;  // data structure protected by the monitor

mutex          cond1_mutex;  // first application-specific condition
pthread_cond_t cond1;        // variable and its mutex

mutex          cond2_mutex;  // second application-specific condition
pthread_cond_t cond2;        // variable and its mutex


// procedures that operate on the "shared_data":
procedure_1 {
  ...   // this procedure uses one or more of the application-specific
        // condition variables, to synchronize access to the
        // "shared_data"
}

procedure_2 {
  ...   // also this procedure uses one or more of the 
        // application-specific condition variables
}

}  // end of protected scope
]]>
</programlisting>
Of course, such an ideal syntax is not supported by operating systems.
And the full implementation of a monitor in an operating system will
be quite a bit different from the code above. But for the time being,
the focus is on <emphasis>meaning</emphasis>, and not on
<emphasis>syntax</emphasis>.
So, a small example of an application that needs a monitor will
hopefully help to make the discussion more concrete. The monitor
object could be your bank account: the protected data structure is the
unique resource (your money in the bank), and the method calls on that
data structure are the classical things, such as withdraw money, deposit
money, check the available amount, etc. It is clear that 
clients should be allowed to access the bank account resource only one
by one. For example, the husband checks the account, sees that it
contains 1000 euros, and withdraws 800 of them. He should be
guaranteed that, as soon as he was allowed to perform his first
operation (check the account), he would be certain to proceed without
another client coming in and changing the state of the account: for
example, his wife also checking the account and withdrawing 800 euros.
The monitor-wide mutex <parameter>monitor_mutex</parameter> is
involved in allowing the husband to do his first operation, but
disallowing his wife access to the monitor until he has finished is
an <emphasis>application-specific condition</emphasis>.
Of course, it is not difficult to replace the bank account scenario
with any similar scenario in your real-time or embedded application
where a shared resource must be accessed according to the monitor
semantics. For example: an automatic clothing manufacturing cell has
several machines that need material from the same textile sheet
cutting machine; in order to guarantee color consistency of each
single piece of clothing, each machine must be sure it can get all of
the pieces it needs from the same textile batch.
</para>
<para>
Although far from complete, the ideal code fragment above does
represent the essential aspects of applications that need the
synchronization services of a monitor:
<itemizedlist>
<listitem>
<para>
The monitor has multiple
&ldquo;<emphasis>access points</emphasis>&rdquo;: client tasks can
call (i.e., ask to be allowed to execute) each of the procedures in
the monitor, in any sequence, and at any time.
</para>
</listitem>

<listitem>
<para>
Only <emphasis>one client task</emphasis> should, at any given time,
effectively be allowed to execute some code of the procedures within
the monitor, irrespective of how many clients have requested to do so.
This mutual exclusion is the goal of the monitor-wide mutex
<parameter>monitor_mutex</parameter>: it is used to allow only one
task to enter what we will call the monitor's
&ldquo;activity zone.&rdquo;
</para>
</listitem>

<listitem>
<para>
In addition to the monitor-wide mutex
<parameter>monitor_mutex</parameter>, the different procedures in the
monitor have <emphasis>mutual critical sections</emphasis>. Otherwise,
putting them inside the same protected object would be an unnecessary
complication.  So, in general, a client task that is active in the
monitor's &ldquo;activity zone&rdquo; (and, hence, runs in the
critical section protected by <parameter>monitor_mutex</parameter>)
<emphasis>can block</emphasis> on a task-specific condition. This
facility to block when running inside of a critical section should
sound familiar by now: it's the essence behind the introduction of
<emphasis>condition variables</emphasis> on the operating system scene
(<xref linkend="sect-condvar">). So, it comes as no surprise to see
condition variables appear in the monitor procedures.
</para>
</listitem>

<listitem>
<para>
Once a client task is in the monitor's &ldquo;activity zone&rdquo;, it
cannot be forced by other client tasks to leave that zone: it must
leave it voluntarily. Otherwise, allowing the operating system or the
runtime to force a task to stop
anywhere in the code compromises the logical consistency of the
monitor. Of course, the task can be pre-empted by the scheduler, and
continue later on. However, this delays <emphasis>all</emphasis>
client tasks of the monitor, because none of the other tasks is allowed
to proceed in the activity zone. So, a monitor should be used with a
lot of care in a real-time environment!
</para>
</listitem>

<listitem>
<para>
So, the <parameter>monitor_mutex</parameter> protects the
&ldquo;activity zone&rdquo; of the monitor. This zone is not
explicitly visible in the code fragment above: it is a lock on a
<emphasis>task queue</emphasis>, that would be behind the scenes of a
monitor implementation in a programming language: each client task
that executes a method call on the object is blocked on this task
queue, until it is allowed access to the &ldquo;activity zone.&rdquo;
In a (real-time) OS context, this task queue cannot remain behind the
scenes, so the application programmer will have to
make this queue visible (see below). 
</para>
</listitem>

<listitem>
<para>
<parameter>monitor_mutex</parameter> is a <emphasis>generic</emphasis>
lock, that is part of every monitor structure, independently of the
application that it protects. But the condition variables are
<emphasis>application-specific</emphasis>. And that's the 
reason why a monitor can, in general, <emphasis>not</emphasis> be
implemented as an operating system or runtime primitive: applications
differ so much in their synchronization needs within the monitor
procedures, that it is impossible to let users blindly fill in their
procedures and condition variables as parameters in such a system
call, with the guarantee that client tasks will only be active within
the monitor one by one <emphasis>and</emphasis> according to the
synchronization semantics of the application task. No, the monitor
procedures must be carefully designed together, and this design cannot
but make use of the error-prone and low-level operating system
synchronization primitives discussed before. However, the advantage
remains that all these primitives are used within one single
programming scope.
</para>
</listitem>

<listitem>
<para>
A monitor can be implemented as an abstract data type (i.e., using
nothing but function calls on the monitor, as in the pseudo-code
above), but also as an active component (i.e., in which each of the
function calls above is replaced by a task that executes the function). 
The differences are that, in the active component version, the
procedures run in the context of the tasks in the active monitor
component, and the monitor clients could, in principle, continue doing
something else after they have sent their service request to the
monitor. For example, in the bank account scenario above, the husband
could use an internet banking application to prepare a set of bank
account operations, send them to his bank in one batch operation,
without waiting for the response from the bank server.
</para>
</listitem>

</itemizedlist>
Because a task can block once it has been allowed into the monitor's
activity zone, the monitor implementation becomes necessarily a bit
more complex than the following simplistic solution that uses nothing
more than the monitor-wide <parameter>monitor_mutex</parameter>:
<programlisting>
<![CDATA[
mutex_lock(&monitor_mutex);
switch (proc) {
  case procedure_1:
     ... // execute code of procedure_1
     break;
  case procedure_2:
     ... // execute code of procedure_2
     break;
}
mutex_unlock(&monitor_mutex);
]]>
</programlisting>
This solution is simplistic, because, as said before, allowing a task
to block in the <parameter>monitor_mutex</parameter> critical section
can lead to deadlocks. So, the implementation of the monitor must make
sure, &ldquo;behind the scenes&rdquo; of what is visible in the code,
that:
<orderedlist>
<listitem>
<para>
the task that is currently in the &ldquo;activity zone&rdquo;, leaves
that zone to wait on a condition variable.
</para>
</listitem>

<listitem>
<para>
that task does <emphasis>not</emphasis> leave the monitor, because it
is not yet finished with its monitor procedure and it holds a resource
(i.e., lock) of the monitor which should not be exported to outside of
the monitor.
</para>
</listitem>

<listitem>
<para>
it allows another task into its &ldquo;activity zone.&rdquo;
</para>
</listitem>

</orderedlist>
The section on condition variables (<xref linkend="sect-condvar">)
showed that the condition variable primitive has exactly been
introduced to allow a task to block within a critical section locked
by a mutex. But in themselves, the normal condition variables are not
sufficient: they work only within the code of one single procedure,
and cannot span the scope of several procedures. Hence, the
implementation of a monitor will be more complex than just using the
<parameter>monitor_mutex</parameter> as the mutex of the
required condition variables. The <parameter>monitor_mutex</parameter>
and the condition variables must be <emphasis>integrated</emphasis>,
in an application-specific way. And <emphasis>that</emphasis> is the
reason why a general monitor does not exist in programming languages
or runtimes.
</para>
<para>
Of course, not all applications need the full version of the monitor
idea, so there exist various levels of functional complexity in the
monitor concept. Not surprisingly, the more complex ones carry the
highest cost in nondeterministic timing behaviour. The following
sections present monitors with increasing functional complexity.
</para>


<sect2 id="monitor-multi-crit">
<title>Multi-procedure critical section</title>
<para>
<indexterm><primary>multi-procedure monitor</primary></indexterm>
<indexterm><primary>multi-procedure critical section</primary>
</indexterm>
<indexterm>
 <primary>monitor</primary><secondary>multi-procedure</secondary>
</indexterm>
<indexterm>
 <primary>critical section</primary>
 <secondary>multi-procedure</secondary>
</indexterm>
This is the simplest monitor, and it delivers only the service of 
<emphasis>exclusive access</emphasis> to its procedures. That means
that only one client task can execute any of its procedures at the
same time. So, the procedures in the monitor don't have critical
sections inside, but, on the contrary, they are within the critical
section provided by the monitor wide
<parameter>monitor_mutex</parameter>. So, the simplistic
implementation above is sufficient, and this kind of monitor
<emphasis>can</emphasis> be offered as a parameterized runtime
primitive.
</para>
<para>
An example of such a monitor is the simplified version of the bank
account: <emphasis>each</emphasis> operation on the bank account
involves entering <emphasis>and</emphasis> leaving the monitor. So,
one misses the &ldquo;batch processing&rdquo; functionality of the
previously given example.
Another example is a <emphasis>command interpreter<indexterm>
<primary>command interpreter</primary></indexterm></emphasis>
of an embedded application that controls a machine: a client comes in
with a request for a machine operation; such an operation request is
typically translated into a sequence of multiple primitive actions on
that machine, so all primitive actions in the client request should be
executed before another client's request can be executed. However,
this other client's request can already be
<emphasis>interpreted</emphasis>, because that can happen outside
of the monitor; the monitor is needed for the
<emphasis>execution</emphasis> of the request, i.e., the unique and
serialized access to the machine.
</para>

</sect2>


<sect2 id="monitor-sem">
<title>Semaphore-based monitor</title>
<para>
<indexterm><primary>semaphore-based monitor</primary></indexterm>
<indexterm>
 <primary>monitor</primary><secondary>semaphore-based</secondary>
</indexterm>
The next level of monitor complexity comes when client tasks do have
application-dependent synchronization needs, but these needs can be
dealt with using <emphasis>binary semaphores</emphasis> only. This
means that the synchronization condition on which a task blocks in the
monitor need not be checked explicitly: when the semaphore is
signaled, the condition is <emphasis>guaranteed</emphasis> to be true.
This kind of monitor is often called a <emphasis>Hoare<indexterm>
<primary>Hoare monitor</primary></indexterm>
<indexterm>
 <primary>monitor</primary><secondary>Hoare</secondary>
</indexterm></emphasis>
monitor, after C.A.R. Hoare, who first described this semantics,
<citation>Hoare74</citation>.
Another name is <emphasis>Mesa<indexterm>
<primary>Mesa monitor</primary></indexterm>
<indexterm>
 <primary>monitor</primary><secondary>Mesa</secondary>
</indexterm></emphasis>
monitor, after Xerox' graphical user interface language Mesa, in which
it was first used, <citation>LampsonRedell80</citation>.
The monitor has the so-called
<emphasis>Signal-and-sleep<indexterm>
<primary>signal-and-sleep</primary></indexterm>
<indexterm>
 <primary>monitor</primary><secondary>signal-and-sleep</secondary>
</indexterm></emphasis> semantics: the task that is in the monitor
signals the condition semaphore, goes to sleep itself, while the
signaled task runs immediately.
The Hoare monitor is the kind of monitor that every object
in &java; offers to the programmer, via the
<parameter>synchronized</parameter> access policy to its methods.
While its <emphasis>implementation</emphasis> is a bit more complex
(semaphores!) and time consuming (context switches!),
its semantics are much simpler: by context-switching immediately to
the signaled task, the monitor guarantees that this
task knows that the condition is satisfied, because no other
task in the monitor could have changed it. This semantics is only
possible if the signaling task can indeed sleep immediately, i.e.,
when at the moment of signaling, it can leave the data structure in a
consistent state. The waking and sleeping on the semaphore occurs
<emphasis>without</emphasis> freeing the
<parameter>monitor_mutex</parameter> mutex; so this synchronization is
between two tasks <emphasis>in</emphasis> the monitor; a task outside
of the monitor can only enter when all of the tasks that are already
in, are waiting, or there are no tasks in the monitor.
</para>
<para>
An example is the classical producer-consumer buffer problem: the
data structure in the monitor is a buffer, in which a producer task
writes data, and from which a consumer client retrieves data. The
semaphore is needed to signal and wait for the (binary!) condition
that the buffer is empty or full:
<programlisting>
<![CDATA[
monitor
{  // begin of monitor scope
const int BUFFER_CAPACITY = ...;
data buffer[BUFFER_CAPACITY];
data nextp, nextc;
int buffered_items = 0;
pthread_cond_t full = false;
pthread_cond_t empty = true;

produce_an_item()
{
  nextp = produce(...);
  if (buffered_items == BUFFER_CAPACITY) wait(full);
  // when going further here, there is certainly place in the buffer
  // and the consumer has set `buffered_items' to its correct value
  buffer[buffered_items++] = nextp;
  signal(empty); // wake up some task waiting to consume an item
 }

consume_an_item()
{
  if (buffered_items == 0) wait(empty);
  // when going further here, something is certainly in the buffer
  // and the producer has set `buffered_items' to its correct value
  nextc = buffer[--buffered_items];
  consume(nextc);
  signal(full); // wake up a producer 
}

} // end of monitor scope
]]>
</programlisting>
The checks for how many items are in the buffer take place
in the critical section protected by the monitor-wide mutex.
After the last signals in <function>produce_an_item()</function>
and <function>consume_an_item()</function>, the producer or consumer
task leaves the monitor, such that a new task can be allowed. This
uses the monitor-wide mutex, and is not visible in the code; it is
assumed to be done by the runtime.
</para>
<para>
This kind of monitor can also reasonably easily be offered as a
parameterized primitive of a generally useful service, such as
buffering.
</para>

</sect2>


<sect2 id="monitor-cond">
<title>Condition variable-based monitor</title>
<para>
<indexterm><primary>condition variable based monitor</primary></indexterm>
<indexterm>
 <primary>monitor</primary><secondary>condition variable based</secondary>
</indexterm>
The most complex monitor allows its procedures to have synchronization
needs that can only be dealt with using composite boolean expressions,
such that condition variables are required.
This kind of monitor is often called a
<emphasis>Hansen<indexterm>
<primary>Hansen monitor</primary></indexterm>
<indexterm>
 <primary>monitor</primary><secondary>Hansen</secondary>
</indexterm></emphasis>
monitor, after Per Brinch Hansen, who first described its semantics,
<citation>BrinchHansen73</citation>.
The semantics of the signaling is now
<emphasis>Signal-and-continue<indexterm>
<primary>signal-and-continue</primary></indexterm>
<indexterm>
 <primary>monitor</primary><secondary>signal-and-continue</secondary>
</indexterm></emphasis>: the task that is in the monitor
and raises the signal continues, and the signaled task is put in a wait
queue (within the monitor!). So, this task is not guaranteed of finding
the condition fulfilled when it gets a chance to run again, and it
should check that condition again. That's the reason for the
<function>while{}</function> loop in the code:
<programlisting>
<![CDATA[
monitor
{  // begin of monitor scope
const int BUFFER_CAPACITY = ...;
data buffer[BUFFER_CAPACITY];
data nextp, nextc;
int buffered_items = 0;
pthread_cond_t full = false;
pthread_cond_t empty = true;

produce_an_item()
{
  nextp = produce();
  while (buffered_items == BUFFER_CAPACITY)
    { wait(full); }
    // the condition is _checked_, not just signaled
  buffer[buffered_items++] = nextp;
  signal(empty); // wake up someone waiting for an item
 }

consume_an_item()
{
  while (buffered_items == 0) wait(empty);
     // at this point, I'm guaranteed to get an item
  nextc = buffer[--buffered_items];
  consume(nextc);
  signal(full); // wake up a producer 
}
} //end of monitor scope
]]>
</programlisting>
An example is a resource allocation system, such as the
producer-consumer buffer above: the shared data structure
is the resource, and the (de)allocation procedures check a lot of
conditions before each client can get or release (part of) the
resource.
</para>
<para>
This kind of monitor is very difficult to offer as a general
parameterized primitive.
</para>
<para>
(TODO: give full code example. E.g. coordinating readers and writers
example of <citation>Nutt2000</citation>, p. 202, but with more
complex conditions than the binary semaphores.)
</para>

</sect2>

</sect1>


<sect1 id="sect-PC-mediator">
<title>Producer-Consumer</title>
<para>
<indexterm><primary>Producer-Consumer</primary></indexterm>
<xref linkend="sect-mediator"> introduced the general concept of a
<emphasis>mediator</emphasis>; <xref linkend="sect-monitor"> explained
how the <emphasis>monitor mediator</emphasis> works.
And this Section applies the pattern to
the very often used <emphasis>Producer-Consumer</emphasis> &ipc;
between two tasks. The Producer-Consumer mediator is the object (data
and methods) that helps task A to send data to task B, without having
(i) to know anything about task B, and
(ii) to worry about the implementation details of getting the data
from A to B. Again, this <emphasis>loose coupling<indexterm>
<primary>loose coupling</primary></indexterm></emphasis>
allows for easier maintenance and updates. For example, if task B is
moved to another process or processor, the mediator can choose,
internally, for a more appropriate type of communication and
buffering, and neither A nor B has to be changed.
</para>


<sect2 id="prod-cons-terminology">
<title>Terminology</title>
<para>
This Section uses the following terminology:
<itemizedlist>

<listitem>
<para>
The <emphasis>producer</emphasis> is the task that wants to send the
data.
</para>
</listitem>

<listitem>
<para>
The <emphasis>consumer</emphasis> is the task that wants to receive
the data.
</para>
</listitem>

<listitem>
<para>
The <emphasis>mediator</emphasis> is the task (active) or object (passive)
that producer and consumer use to perform their communication, without
having to know each other.
</para>
</listitem>

<listitem>
<para>
The data is also called the <emphasis>message</emphasis>.
</para>
</listitem>

<listitem>
<para>
The asymmetry suggested by the terminology &ldquo;producer&rdquo; and
&ldquo;consumer&rdquo; is not really relevant in the mediator pattern.
So, both producer and consumer are called <emphasis>clients</emphasis>
of the mediator.
</para>
</listitem>

<listitem>
<para>
The mediator can be <emphasis>persistent<indexterm>
 <primary>persistent</primary><secondary>mediator</secondary>
</indexterm></emphasis>,
i.e., it is created once at start-up, and handles all requests during
the lifetime of the interaction between both clients.
</para>
</listitem>

<listitem>
<para>
The mediator can be <emphasis>transient<indexterm> 
 <primary>transient</primary><secondary>mediator</secondary>
</indexterm></emphasis>,
i.e., it is created each time a client issues a new request, and
deleted as soon as the request has been handled.
</para>
</listitem>

</itemizedlist>
</para>

</sect2>


<sect2 id="prod-cons-data-handling">
<title>Handling</title>
<para>
<indexterm>
 <primary>handling</primary><secondary>mediator</secondary>
</indexterm>
<indexterm>
 <primary>mediator</primary><secondary>handling</secondary>
</indexterm>
Every line of code in a program is executed by one particular task
(possibly the kernel).  One says that the code &ldquo;runs in
the task's <emphasis>context</emphasis>&rdquo;, using its
stack, program counter, etc. In the method calls on the mediator object,
it is not always clear or predictable which parts are executed in
which context. For both unprotected and protected objects, everything
that happens &ldquo;in&rdquo; the mediator is in fact executed using
the stack and the context of one of the clients. One discriminates
between the <emphasis>synchronous</emphasis> and
<emphasis>asynchronous</emphasis> parts of every call of a mediator
method:
<itemizedlist>

<listitem>
<para>

<emphasis>Synchronous.<indexterm>
<primary>synchronous</primary></indexterm></emphasis>
<indexterm>
 <primary>execution</primary><secondary>synchronous</secondary>
</indexterm>
This is the part that executes <emphasis>most definitely</emphasis> in
the context of the calling client. &ldquo;Synchronous&rdquo; here
means: &ldquo;activated by code in the method call&rdquo; that the
client performs.
</para>
</listitem>

<listitem>
<para>
<emphasis>Asynchronous.<indexterm>
<primary>asynchronous</primary></indexterm></emphasis>
<indexterm>
 <primary>execution</primary><secondary>asynchronous</secondary>
</indexterm>
This part does <emphasis>not necessarily</emphasis> run in the context
of the calling client (but that remains possible), because it is
executed &ldquo;asynchronously&rdquo;. That means, it is not directly
activated by code in the client call, but by other methods of the
mediator. These other method calls can, for example, be activated by
the other client of the mediator. A typical example: the synchronous
part gets the data from the producer to the mediator protected object,
where it stays until the consumer asks for it later on
(&ldquo;asynchronously&rdquo;).

</para>
</listitem>

</itemizedlist>
Every client call involves, in general, three distinct
handlers (or &ldquo;services&rdquo;) by the mediator: synchronous,
asynchronous, and completion handling:
<itemizedlist>

<listitem>
<para>
<emphasis>Synchronous handling</emphasis>
<indexterm>
 <primary>handling</primary><secondary>synchronous</secondary>
</indexterm>
is that part of the interaction that is done in the client's method
call: the client changes some mediator data structure that remembers
that this call has taken place and that it needs further
handling (in other words, it makes the producer-consumer data exchange
&ldquo;pending&rdquo;), and possibly also copies the data needed for
this further handling.
In general, this synchronous part involves some locks on protected
data structures, and hence possibly blocks the calling thread.
</para>
</listitem>

<listitem>
<para>
<emphasis>Asynchronous handling.</emphasis>
<indexterm>
 <primary>handling</primary><secondary>asynchronous</secondary>
</indexterm>
The mediator usually has to do more work than the message copying and
bookkeeping in the synchronous part: the message must effectively be
delivered to a consumer; the buffers must be updated according to
incoming priority and cancellation requests; an event that has fired
has to be serviced; etc.  How exactly the further handling is done
depends on the type of the mediator:
 <itemizedlist>

 <listitem>
 <para>
<emphasis>The mediator is a passive object (unprotected or protected
object).</emphasis> In this case, one of the interaction initiating client
tasks executes <emphasis>all</emphasis> the asynchronous handling
that is pending in the mediator. Not only its own handling, but that
of all pending requests. So, for this client, there is no real
distinction between the synchronous and asynchronous handling parts.
 </para>
 </listitem>

 <listitem>
 <para>
<emphasis>The mediator is a task (active object, or component).</emphasis>
The &ipc; initiating client continues after the synchronous handling
finishes, i.e., it has put all data for further handling in an
appropriate buffer, and the mediator further processes this data later
on, in its own context.
 </para>
 </listitem>

 </itemizedlist>
</para>
</listitem>

<listitem>
<para>
<emphasis>Completion handling</emphasis><indexterm>
<primary>completion handling</primary></indexterm>
<indexterm>
 <primary>handling</primary><secondary>completion</secondary>
</indexterm>
is an optional third handling part that clients can ask for. For
example: a producer sends new data to a consumer only as soon as it
knows that the consumer is ready to accept the new data. Or, the
producer blocks until the mediator has completed the interaction,
after which the producer is woken up by the mediator.
</para>
<para>
Completion is performed <emphasis>after</emphasis> the asynchronous
handling, and in the context of the task that just finished executing
the last asynchronous handling.  Completion could involve the mediator
accessing data structures of the client that has registered the
completion. Hence, completion requires more careful coding from the
application programmer, because it runs asynchronously and hence the
context is probably not the one of the registering client. An
additional concern for the application programmer is that it is not
always straightforward to guarantee that, at the time of completion,
the called object still exists. In summary, treat a completion handler
as if it were an interrupt handler: it should never do something that
can block.
</para>
</listitem>

</itemizedlist>
The client (or every task authorized to do it for the client) has to
register <emphasis>explicitly</emphasis> a completion
function (or &ldquo;call-back function<indexterm>
<primary>call-back</primary></indexterm>&rdquo;)
with the mediator. Synchronous and asynchronous handlings need not be
registered in the case of a Producer-Consumer mediator,
because they are the default send and receive methods of the
Producer-Consumer mediator.  (However, other types of mediators could
require explicit registering of client-specific functions for
<emphasis>all</emphasis> handlers.) The mediator calls the
registered completion function at the appropriate moment, i.e., after
all asynchronous handling has been done.
</para>
<para>
Note that (i) asynchronous and completion handling need not be present
in every mediator; (ii) asynchronous handling must always come after the
synchronous handling; and (iii) completion must always come after
asynchronous handling.
</para>

</sect2>


<sect2 id="prod-cons-data-buffering">
<title>Data buffering</title>
<para>
The mediator can buffer the protected message data in various ways, as
explained in <xref linkend="ipc-dataexchange">.
</para>

</sect2>


<sect2 id="prod-cons-data-access">
<title>Data access</title>
<para>
Accessing the data of the message object in the mediator can happen in
various ways, each with a different trade-off between blocking and
data protection:
<itemizedlist>

<listitem>
<para>
<emphasis>Unprotected object.</emphasis> The data of the message is
directly accessed in the &ipc; function calls, i.e., it is
<emphasis>shared memory</emphasis> in the form of a global variable.
This allows for the most efficient data access, with the shortest
amount of blocking, but it is only a viable &ipc; mechanism if producer
and consumer are <emphasis>guaranteed</emphasis> not to access the data
simultaneously.  Indeed, in general, these &ipc; calls on an unprotected
mediator object are <emphasis>not thread safe</emphasis> because the
object contains data that is shared between consumer and producer.
Hence, this approach is only viable in fully deterministic cases, such
as an &isr;-&dsr; combination.
</para>
</listitem>

<listitem>
<para>
<emphasis>Protected object.</emphasis> (Also called &ldquo;monitor&rdquo; in some
literature.) The data of the message is not directly accessible to
producer and consumer. They must use access method calls (&ldquo;read&rdquo; and
&ldquo;write&rdquo;), which are <emphasis>serialized</emphasis> within the mediator
by some kind of mutual exclusion lock (mutex, semaphore, etc.) around
the data.

This allows (but does not automatically guarantee!) safe access to the
message data, but producer and consumer can block on the lock.
</para>
</listitem>

<listitem>
<para>
<emphasis>Active object.</emphasis>
This is conceptually the same as a protected object, but with one
important difference: the mediator has its own thread.
</para>
</listitem>


<listitem>
<para>
<emphasis>Component.</emphasis> The protected or active
objects are a good solution in a system in which producers and
consumers know which objects to use.  Modern software systems become
more dynamic and more distributed, and having to know the identity of
all services in the system becomes a scaling bottle-neck. Therefore,
the concept of components has been introduced: they have a protected
object inside, but have extra functionality to work together with a
<emphasis>name server</emphasis> that offers run-time and
network-transparent bindings between components.
</para>
</listitem>

</itemizedlist>
</para>

</sect2>

</sect1>


<sect1 id="sect-events">
<title>Events</title>

<para>
<indexterm><primary>event</primary></indexterm>
The event pattern describes how to
synchronize activities running in different tasks, with very loose
coupling between the tasks.
The event pattern is
applicable to the &rtos; primitives <emphasis>signal<indexterm>
<primary>signal</primary></indexterm></emphasis> 
(<xref linkend="sect-signal">) and
<emphasis>interrupt<indexterm>
<primary>interrupt</primary></indexterm></emphasis>
(<xref linkend="sect-inter-sw">),
with only minor adaptations:
<itemizedlist>

<listitem>
<para>
Tasks must <emphasis>register</emphasis> to get notified about events
and interrupts, while (in the &posix; semantics, at least) they have
to explicitly <emphasis>de-register</emphasis> from every signal they
don't want to receive.
</para>
</listitem>

<listitem>
<para>
The synchronous handling of interrupts is initiated by the
<emphasis>hardware</emphasis>, and not by another software task.
</para>
</listitem>

<listitem>
<para>
Events can carry any form of data, while signals and interrupts are
data-less triggers. Or, almost data-less: they can carry information
about, for example, the time or the cause of the triggering.
</para>
</listitem>

</itemizedlist>
</para>

<sect2 id="event-semantics">
<title>Semantics</title>
<para>
The event pattern has a lot in common with the Producer-Consumer
pattern, but its emphasis is on
<emphasis>synchronization of the tasks' activities</emphasis> and not on
<emphasis>data exchange</emphasis>. In any case, this discussion
re-uses as much material of the Producer-Consumer pattern as possible.
The semantics of a general event are as follows:
<itemizedlist>

<listitem>
<para>
<emphasis>(De-)Registration</emphasis> of a
<emphasis>listener<indexterm>
<primary>listener</primary></indexterm></emphasis>
<indexterm>
 <primary>event</primary><secondary>listener</secondary>
</indexterm>
function.  Registration is a <emphasis>configuration time</emphasis>
activity, and is not part of the event's interaction itself.
At registration, a task gives (a reference to) a function call (the
&ldquo;listener&rdquo;) which must be called as
<emphasis>synchronous or asynchronous handler</emphasis> when the event 
<emphasis>fires</emphasis>.
<indexterm>
 <primary>firing</primary><secondary>event</secondary>
</indexterm>
Whether the listener is synchronous or asynchronous is again
a configuration option. When called, the listener gets information
about which event has caused it to run.
</para>
<para>
Multiple processes can register their listeners with the same event.
And the same listener can be registered with several events.
</para>
</listitem>

<listitem>
<para>
<emphasis>(De-)Registration</emphasis> of a
<emphasis>completion<indexterm>
<primary>completion</primary></indexterm></emphasis>
<indexterm>
 <primary>event</primary><secondary>completion</secondary>
</indexterm>
function. This is technically similar to listener registration, but
functionally different: the completor is called by the event mediator
when all synchronous and asynchronous activities for this event have
finished. 
The task that registers a completion function need not be the one that
registers the listener. And a task doesn't have to register in order
to be allowed to fire an event.
</para>
</listitem>

<listitem>
<para>
<emphasis>Firing.<indexterm>
<primary>firing</primary></indexterm></emphasis>
<indexterm>
 <primary>event</primary><secondary>firing</secondary>
</indexterm>
A task fires the event, i.e., it executes a method call of the
mediator, that performs the <emphasis>synchronous handling</emphasis>
of the event, and puts the asynchronous and completion handling in the
pending queue for the fired event. (All this requires synchronized
access to the mediator's bookkeeping data structures.)
</para>
</listitem>

<listitem>
<para>
<emphasis>Guard.<indexterm>
<primary>guard</primary></indexterm></emphasis>
<indexterm>
 <primary>event</primary><secondary>guard</secondary>
</indexterm>
Whether a firing event really sets the handling in action or not, can
be made dependent on a <emphasis>guard</emphasis>. This can be any
Boolean expression, which prevents the firing if its evaluation
returns &ldquo;false&rdquo;. The evaluation happens instantaneously,
at the fire time of the event, i.e., in the synchronous handling by
the mediator.
</para>
<para>
Be careful with guards: they are too powerful a mechanism for
scalable and deterministic software systems. Having a Boolean decide
about whether or not to do something is often a sign of a bad design:
it's much cleaner to have this &ldquo;state dependency&rdquo; inside
the mediator, by providing it with the
<emphasis>State Machine</emphasis> mechanism
(<xref linkend="sect-fsm">).
</para>
</listitem>

<listitem>
<para>
<emphasis>Handling.<indexterm>
<primary>handling</primary></indexterm></emphasis>
<indexterm>
 <primary>event</primary><secondary>handling</secondary>
</indexterm>
This covers the asynchronous and completion parts of the event. So,
handling calls the registered listeners and/or completion functions.
</para>
</listitem>

</itemizedlist>
</para>

</sect2>


<sect2 id="event-policy">
<title>Policies</title>
<para>
Events can have multiple <emphasis>policies</emphasis> on top of the
above-mentioned mechanism. All of the following ones can be combined:
<itemizedlist>

<listitem>
<para>
<emphasis>Queue pending events.</emphasis> The event mediator can have
a queue for each listener, in which it drops every fired event, at
synchronous handling. So, no events are lost when a new one arrives,
while the synchronous processing is still busy with a previous event.
</para>
</listitem>
<listitem>
<para>
<emphasis>Prioritize listeners and completors.</emphasis> This makes it
possible to influence the order in which they are executed.
</para>
</listitem>
<listitem>
<para>
<emphasis>One-off execution of listeners and completors.</emphasis>
This means that on each fired event, only one of the registered
listeners and completors is executed.
</para>
</listitem>
</itemizedlist>
</para>

</sect2>



<sect2 id="event-composite">
<title>Composite events</title>
<para>
Often, a task wants to be notified not just when one particular
event has been fired, but whenever a logical &ldquo;AND&rdquo; or
&ldquo;OR&rdquo; combination of several events has occurred.  Such a
composite event <parameter>C</parameter> could be implemented by a
mediator event between the process <parameter>P</parameter> and the
two events <parameter>A</parameter> and <parameter>B</parameter>.
For example, for the AND composite event:
<itemizedlist>

<listitem>
<para>
The task <parameter>P</parameter> registers its listener and
completion function with <parameter>C</parameter>.
</para>
</listitem>

<listitem>
<para>
<parameter>C</parameter> registers listeners for both
<parameter>A</parameter> and <parameter>B</parameter> but registers no
completion handler.
</para>
</listitem>

<listitem>
<para>
The listeners for <parameter>A</parameter> and
<parameter>B</parameter> look like this:
<programlisting>
<![CDATA[
if (A_has_fired and B_has_fired)
   clear A_has_fired
   clear B_has_fired
   fire C
]]>
</programlisting>
This code runs internally in the mediator of <parameter>C</parameter>,
hence it can access the flags <parameter>A_has_fired</parameter> and
<parameter>B_has_fired</parameter> atomically, without needing the
overhead of a critical section.
</para>
</listitem>

</itemizedlist>
Of course, any Boolean expression of events can be implemented as a
composite event. Whether or not to provide a separate composite event
for a specific Boolean expression is an efficiency trade-off: writing
a new object, versus introducing multiple levels of the simple AND and
OR composite events.
</para>

</sect2>


<sect2 id="event-loop-exit">
<title>Loop with asynchronous exit event</title>
<para>
A task <parameter>P</parameter> cyclically runs a function
<parameter>F</parameter> which it must execute whenever event
<parameter>A</parameter> occurs. But whenever event
<parameter>B</parameter> occurs, the task should exit the loop around
<parameter>F</parameter>. <parameter>B</parameter> can be asynchronous
to the execution of the loop, so it's best to let
<parameter>P</parameter> finish the loop, and only then exit.  This
can be done by having the process wait for the
&ldquo;OR&rdquo; of both events, and then take appropriate action
(loop or exit).  Here follows a possible implementation, where the
composite event signals a condition variable:
<programlisting>
<![CDATA[
Task P:                        Composite event listener:

while (1) {                    if (A_has_fired or B_has_fired)
  wait_on_condition(A_OR_B);      broadcast(A_OR_B);
  if (A) F;
  if (B) exit;
  }
]]>
</programlisting>
The exit is done &ldquo;synchronously&rdquo;, i.e., it never
interrupts the loop function <parameter>F</parameter>. As a result,
the process comes out of the loop in a predictable state.
</para>

</sect2>


<sect2 id="event-caution">
<title>Some caution</title>
<para>
Let's conclude this Section with a critical note by Per Brinch Hansen
on event variables <citation>BrinchHansen73</citation>:
&ldquo;Event operations force the programmer to be aware of the
relative speeds of the sending and receiving processes.&rdquo; And: 
&ldquo;We must therefore conclude that event variables of the previous type
are impractical for system design. The effect of an interaction
between two processes must be independent of the speed at which it is
carried out.&rdquo; He was talking about using events as a
<emphasis>general</emphasis> multi-tasking synchronization primitive,
replacing the synchronizations of <xref linkend="ipc-synch">. And in
that context, his remarks are very valid (and he suggested
<emphasis>condition variables</emphasis>,
<xref linkend="sect-condvar">). But there are situations
where the synchronization is <emphasis>not</emphasis> time-dependent;
for example, the feedback control example in
<xref linkend="chap-control">.
</para>

</sect2>

</sect1>


<sect1 id="sect-fsm">
<title>State Machines</title>

<para><indexterm>
<primary>state machine</primary></indexterm>
A state machine is a common way to give structure to the
execution of computer tasks: a task can be in a number of possible
<emphasis>states</emphasis>, performing a
<emphasis>particular function</emphasis> in each of these states, and
making a transition to another state caused by either an external
event or internal state logic.  So, a state machine is the appropriate
pattern for an application in which different modes of control are to
be available, and the transitions between these modes are triggered by
events.
</para>
<para>
This Section discusses <emphasis>object</emphasis> state machines: the
state machine doesn't describe the classical
<emphasis>process</emphasis> of actions triggered by events, but it
allows an <emphasis>object to change its behaviour</emphasis> through
events. The software engineering
advantage of the object-based approach to state machines is that the
internals of the states need not be exported outside of the object.
This Section describes the mechanism of one particular type of state
machine, where the design goal is to maximize determinism and semantic
unambiguity, at the cost of ultimate generality.  The execution of the
above-mentioned state functionality requires, in general,
<emphasis>finite amounts of time</emphasis>, while the mathematical
state machine reacts in zero time. No software approach can guarantee
such zero-time execution, but the presented object state machine does
guarantee that all state functions are executed atomically within the
context of the state machine, i.e., state functions are properly
serialized with state transitions.
</para>
<para>
(TODO: code examples; Hierarchical State Machines
<ulink
url="http://www.eventhelix.com/RealtimeMantra/HierarchicalStateMachine.htm">
eventhelix example
</ulink>?;)
</para>


<sect2 id="fsm-semantics">
<title>Semantics</title>
<para>
An object state machine is a composite class that manages the
following data:
<itemizedlist>

<listitem>
<para>
One <emphasis>class</emphasis> for each state in the state machine. It
contains the functions to be called in the state, as discussed below.
</para>
</listitem>

<listitem>
<para>
 <emphasis>Events</emphasis> to make the object
<emphasis>transition</emphasis> to other states.
</para>
</listitem>

<listitem>
<para>
A <emphasis>graph</emphasis>, to represent the structure of the state
machine: a node is a state class, and an edge is a transition between
states.
</para>
</listitem>

</itemizedlist>
The choice for a graph object corresponds to the choice of a
<emphasis>persistent</emphasis> state machine mediator: the graph
persistently stores the information of transitions and related events,
such that this information is directly available and no time is lost
creating or deleting state objects. This is an example of the
classical trade-off between computation cost and storage cost of
performing the same functionality.

<figure id="fig-state-functions" float="1" pgwide="0">
<title>
General structure of a state.
</title>
<mediaobject>
<imageobject>
<imagedata fileref="rthowtofigs/state-functions.png" format="PNG">
</imageobject>
<imageobject>
<imagedata fileref="rthowtofigs/state-functions.eps" format="EPS">
</imageobject>
</mediaobject>
</figure>

Figure <xref linkend="fig-state-functions"> shows the general
structure of a state:
<itemizedlist>

<listitem>
<para>
<emphasis>Entry.</emphasis> This function runs when the object first
enters a state. If a state is implemented as a transient object, this
would be the state object's constructor. The last thing the entry
function does is to call the state function.
</para>
</listitem>

<listitem>
<para>
<emphasis>State function.</emphasis> The state object runs this
function after the entry. The state function is guaranteed to be
executed <emphasis>atomically</emphasis> (i.e., without interruption)
<emphasis>within the context of the object's state machine</emphasis>.
That is, no state transitions can happen when the function is running.
</para>
<para>
The state function can be an <emphasis>action</emphasis> or an
<emphasis>activity</emphasis>:
 <itemizedlist>

 <listitem>
 <para>
 <emphasis>Action.</emphasis> The state function runs once, performs a
certain &ldquo;action&rdquo; (such as setting an output), and then runs the
exit function (see below).
 </para>
 </listitem>

 <listitem>
 <para>
 <emphasis>Activity.</emphasis> The state function runs in a loop, from which
it exits when it receives the &ldquo;abort&rdquo; event, or when it decides
itself to exit from its loop.
 </para>
 </listitem>

 </itemizedlist>
</para>
</listitem>

<listitem>
<para>
<emphasis>Exit.</emphasis> This function runs when the object is about
to transition to another state. For a transient object, it would be
the object's destructor.
</para>
<para>
The exit function calls the state machine object (with as parameters
the current state and the event that has caused the transition) to
load the next state information in the state object, and (optionally)
fires an event that signals the state exit. Loading the next state
means that new entry, state and exit functions are filled in in the
corresponding data structure of the state machine object.
</para>
<para>
When the next state is equal to the current state, the object goes
directly to the state function, without executing the entry function
again.
</para>
</listitem>

</itemizedlist>
This mechanism does not represent the most general form of state
machine: the atomicity of the state function (action as well as
activity) is a restriction on the generality, but this serialization
of state function execution and state transitioning adds a lot to the
<emphasis>determinism</emphasis> of the state machine.
If the state function is not guaranteed to run until completion, the
object could end up with unpredictable and inconsistent values of some
variables. The atomicity is only guaranteed within the context of the
running task, and not within the whole software system.
</para>
<para>
The presented mechanism can represent both
<emphasis>Moore<indexterm>
<primary>Moore state machine</primary></indexterm></emphasis>
<indexterm>
 <primary>state machine</primary><secondary>Moore</secondary>
</indexterm>
and <emphasis>Mealy<indexterm>
<primary>Mealy state machine</primary></indexterm></emphasis>
<indexterm>
 <primary>state machine</primary><secondary>Mealy</secondary>
</indexterm>
state machines. If the state function is an activity, the state
machine is a Moore machine. An action
(or &ldquo;(discrete) change&rdquo;) is associated with a transition,
and in that case, the state machine is a Mealy state machine,
<citation>Mealy55</citation>.  So, Moore machines are appropriate for
continuous, non zero-time activity (such as software objects), and
Mealy machines for discrete changes (such as electronic circuits).
</para>

</sect2>

<sect2 id="fsm-events">
<title>Implementation with events</title>
<para>
The execution of a state machine can be implemented on top of the
event mechanism of <xref linkend="sect-events">. The state
machine object has an event object for each of its transition events,
and it has a data structure that stores the entry, state and exit
functions of the currently active state. The event that causes a state
transition has been initialized as follows: 
<itemizedlist>

<listitem>
<para>
Its listener executes the current state's exit function.
</para>
</listitem>

<listitem>
<para>
Its completer executes the new state's entry function (unless
the new state is the same as the old state), as well as its state
function.
</para>
</listitem>

</itemizedlist>
The listener and completer select the right functions from the
information in the state machine graph, and from the identity of the
current state.
</para>
<para>
In principle, both actions (i.e., exit and entry functions) could be
done by the listener. But if the event causes more things than just
a state transition in a state object, it could be interesting to have
all this event's listeners executed before the completer executes any
of the state entry functions.
</para>
<para>
The above-mentioned run-time registration of listeners and completer
is not always a good idea, because registration involves a lot of
linked list operations. An alternative is to have the state machine
listen to all events, and let its listener call the corresponding
state listeners and completers.
</para>
<para>
The advantage of using events to trigger transitions is that the
knowledge of which state to transition to at exit is not stored
in the current state, but in the state machine object's graph
structure.  In this sense, the state machine object is a mediator
between the different states.
</para>

</sect2>

</sect1>


<sect1 id="sect-execeng">
<title>Execution Engine</title>

<para>
(TODO: <emphasis>sequencing</emphasis> of non-distributed but
non-linear activities;)
</para>

<para>
The <emphasis>Execution Engine<indexterm>
<primary>Execution Engine</primary></indexterm></emphasis> is a
pattern that takes care of <emphasis>activation</emphasis> and
<emphasis>configuration</emphasis> of software components:
<itemizedlist>

<listitem>
<para>
Activating components respecting their individual timing
specifications.
</para>
</listitem>

<listitem>
<para>
Run-time configuration of components.
</para>
</listitem>

</itemizedlist>
The Execution Engine is a mediator object
(<xref linkend="sect-mediator">)
in the sense that it decouples the activation and configuration
synchronization of several components. Unlike previously discussed
mediators, it doesn't take care of any
<emphasis>data exchange</emphasis> or
<emphasis>mutual exclusion synchronization</emphasis> between the
components.
</para>
<para>
By localizing the <emphasis>activation logic</emphasis> of a complete
application in one single mediator, the system is much easier to
understand, program, adapt, and make deterministic. The core of the
Execution Engine can be a <emphasis>finite state machine</emphasis>,
whose outputs are triggers for the other components; for the pacing of
its state machine, the Execution Engine relies on basic timer
functionality of the &rtos; on which it is implemented.
</para>

</sect1>


<sect1 id="pattern-ipc">
<title>Distributed IPC</title>
<para>
The important domain 
of <emphasis>component distribution and communication</emphasis>
has already been developed quite extensively.
Douglas C. Schmidt's free software projects
<ulink url="http://www.cs.wustl.edu/~schmidt/ACE.html">ACE</ulink>
(<emphasis>Adaptive Communication Environment</emphasis>)
and
<ulink url="http://www.cs.wustl.edu/~schmidt/TAO.html">TAO</ulink>
(<emphasis>The ACE Orb</emphasis>) are primary references. This work
has been an important basis for the specification of 
<emphasis>Real-Time &corba;</emphasis>,<indexterm>
<primary>Real-Time &corba;</primary></indexterm>
<indexterm>
 <primary>&corba;</primary><secondary>Real-Time</secondary>
</indexterm>
<xref linkend="standards-rtcorba">.
</para>
<para>
There is sufficient documentation and code available on-line, so this
text will not go into more detail. Especially because distributed
&ipc; is inherently not a hard real-time system.
</para>
<para>
(TODO: more details; example with real-time or embedded relevance:
<emphasis>DAIS</emphasis> (Data Acquisition from Industrial Systems
Specification, OMG group effort for large-scale data acquisition);)
</para>

</sect1>


<sect1 id="pattern-transaction">
<title>Transactions</title>
<para>
Transactions are an important concept
in the context of databases:
<ulink
 url="http://www.cis.temple.edu/~ingargio/old/cis307s01/readings/transaction.html"><emphasis>ACID<indexterm>
<primary>ACID</primary></indexterm></emphasis></ulink>
(Atomicity, Consistency, Isolation/Serializability, Durability).
In that form, it is too heavy for real-time systems, which often
interact with a real world, in which actions cannot be undone.
But a more realistic sub-primitive is the
<emphasis>Two-Phase Commit<indexterm>
<primary>Two-Phase Commit</primary></indexterm></emphasis> (TPC)
pattern for atomic transactions in a Distributed System,
<citation>Gray78</citation>,
<citation>Galli2000</citation>,
<citation>BurnsWellings2001</citation>, p.390: two tasks want
to be sure that both of them agree on a particular action being done,
and the TPC guarantees that the action is completely done or
completely undone. The first of the two phases is the
<emphasis>negotiation and set-up phase</emphasis>, and the second phase
is the
<emphasis>execution phase (&ldquo;commit&rdquo;)</emphasis>.
</para>
<para>
(TODO: more details; example with real-time relevance;)
</para>

</sect1>

</chapter>


<chapter id="chap-control">
<title>Design example: &ldquo;control&rdquo;</title>
<para>
&ldquo;Control&rdquo; is a very mature and broad domain, with
thousands of research publications every year. Most of these
publications deal with new <emphasis>applications</emphasis> of
existing concepts and technology, or with improved
<emphasis>functionality</emphasis> of existing approaches. There is
almost no evolution anymore in the <emphasis>fundamentals</emphasis>
of the technology. But this lack of evolution is not perceived as a
problem, because the fundamentals are mature and have proven to work.
This means control is an exquisite subject to define Software Patterns
for, <xref linkend="chap-patterns">.
</para>
<para>
This Chapter describes these Patterns, as far as they are relevant
for the <emphasis>real-time</emphasis> software engineering aspects of
the problem. It presents (a design for) a generic, hard real-time
control framework, making use of the decoupling ideas and other 
Software Patterns introduced in the previous Chapters. A similar
discussion could be held for other mature application areas, such as
telecommunication.
</para>
<para>
(TODO: are there other hard real-time areas besides control and
telecom? Is telecom really hard real-time? Or is its hard real-time
functionality only the signal processing, which we take as part of the
generic control pattern?)
</para>
<para>
The first message of this Chapter is that many complex hard real-time
systems can be built using only an amazingly small set of the
primitives offered by a typical &rtos;. (The design presented in this
Chapter can even run <emphasis>without</emphasis> operating
system.) This fact often comes as a surprise to students or newcomers
in the field, because they tend to come up with systems that have
separate tasks for every piece of functionality in the system, and
that need complex &ipc;, driven as they are by their eagerness to use
the largest possible set of the &rtos; primitives they've
learned in the classroom. So, also in real-time and embedded
application programming, simplicity of design is the signature of the
real craftsman.
</para>
<para>
The second message is inspired by the observation that experienced
designers in <emphasis>every</emphasis> particular application domain
introduce a lot of <emphasis>structure</emphasis> in the way
they solve the application problems. They do this most often
<emphasis>implicitly</emphasis>. So, the message is to first make
explicit the largest form of structure that is generic for the
application domain, and then use it to build the
<emphasis>infrastructure</emphasis> parts in your design.
The rule of thumb is that structure <emphasis>always</emphasis> leads
to efficiency gains, in design, in implementation, and in
documentation.
</para>
<para>
The third message is to document and localize the
<emphasis>&ldquo;hot spots&rdquo;</emphasis><indexterm>
<primary>hot spot</primary></indexterm>
in your design. That is, those parts that will have to be changed
whenever the application is ported to new hardware and a new operating
system. This Chapter calls them the
<emphasis>device interface</emphasis> and the
<emphasis>operating system interface</emphasis>.
</para>

<sect1 id="sect-control-what">
<title>What is control?</title>
<para>
This Chapter uses &ldquo;control&rdquo; to illustrate the
above-mentioned messages. It interprets the concept of control quite
broadly: the presented framework covers various domains, known under
names such as:
<itemizedlist>

<listitem>
<para>
 <emphasis>Pure data acquisition,</emphasis> as implemented by 
&comedi; (<xref linkend="sec-dev-comedi">).
</para>
</listitem>

<listitem>
<para>
<emphasis>Extended data acquisition and generation,</emphasis> with
pre- and post-processing of the signals. For example, applications
which must calibrate equipment against standards, send specific pulse
trains, detect peaks and abrupt changes, etc.
The presented design can be seen as an extension to &comedi;, adding
signal processing and feedback control functionality.
</para>
</listitem>

<listitem>
<para>
<emphasis>Waveform generation</emphasis>, which is the same as the
generation application above.
</para>
</listitem>

<listitem>
<para>
<emphasis>Programmable Logic Control (PLC)</emphasis>, i.e., the
&ldquo;primitive&rdquo; form of control in which all inputs are read
first and stored in a buffer, then a set of (possibly unrelated)
functions are run in sequence, each producing one or more values in
the output buffer, which is finally written to the peripherals
&ldquo;in block.&rdquo; PLC functionality is present in most machine
tools, to manage discrete actions, such as closing valves, setting
LEDs, and even simple control loops such as
<emphasis>PID</emphasis><indexterm>
<primary>PID</primary></indexterm>
(Proportional-Integral-Derivative).
</para>
</listitem>

<listitem>
<para>
 <emphasis>Feedforward/feedback control,</emphasis> such as in
robotics or other mechatronic systems.
</para>
</listitem>

<listitem>
<para>
<emphasis>Observation,</emphasis> i.e., measured
signals are monitored, and specific patterns in the signals are
detected and reacted to.
</para>
</listitem>

<listitem>
<para>
<emphasis>Estimation,</emphasis> i.e., the measured raw
signals are processed, and estimates of non directly measurable
quantities are derived from them. Observation and estimation are often
used as synonyms; this text will do that too.
</para>
</listitem>

<listitem>
<para>
<emphasis>Signal processing</emphasis>: still another name for all 
the applications mentioned above (i.e., those that don't drive outputs
based on measured inputs).
</para>
</listitem>

</itemizedlist>
</para>

</sect1>


<sect1 id="sect-control-functional">
<title>Functional components</title>
<para>
The presented design is limited to the
<emphasis>common real-time (infra)structure</emphasis> needed by all
these applications. Application-specific
<emphasis>functionality</emphasis> must be implemented on top of it,
via <emphasis>&ldquo;plug-ins.&rdquo;</emphasis>
This Section presents the (application-independent)
<emphasis>functional</emphasis>
parts of the generic control system. (Some of the above-mentioned
application areas don't need all of these components.) Each functional
component has a specific goal in the overall control (i.e., it runs an
application-specific <emphasis>algorithm</emphasis>), and the
interfaces between the parts are small and well defined.
</para>
<para>
An interface consists of: (i) data structures; (ii) function calls;
and (iii) events (<xref linkend="sect-events">). Data structures and
function calls can be considered as one single part of the interface,
by assuming that each access to the data takes place through a
function call. The event information in the interface specifies for
which events the component has a &ldquo;listener&rdquo; (without
saying explicitly what the listener does), and which other events it
can &ldquo;fire&rdquo;.
</para>
<para>
The following Section discusses the
<emphasis>infrastructural</emphasis> parts of the design, i.e., those
that support the functional components in their actions, but contain
no application-specific functionality themselves. The functional
components are:
<itemizedlist>

<listitem>
<para>
<emphasis>Scanner</emphasis>: measures signals on interface cards.
</para>
</listitem>

<listitem>
<para>
<emphasis>Actuator</emphasis>: writes setpoints to interface cards.
</para>
</listitem>

<listitem>
<para>
<emphasis>Generator</emphasis>: generates signal setpoints. It
supports <emphasis>hybrid signals</emphasis>, i.e., discrete signals
(&ldquo;pulses&rdquo;), analog signals (in sampled form, of course),
as well as discrete switches between analog signal forms.  In its
signal generation, it can make use of the data that other components
have available.  In control theory, one calls the functionality
offered by the Generator
&ldquo;<emphasis>feedforward</emphasis>&rdquo; and/or
&ldquo;<emphasis>setpoint generation</emphasis>.&rdquo;
</para>
</listitem>

<listitem>
<para>
<emphasis>Observer</emphasis>: reads Generator and Scanner results, and
calculates estimates on these data. Lots of application-dependent
forms of data observation exist, known under names such as
&ldquo;filtering,&rdquo; &ldquo;transformations,&rdquo;
&ldquo;data reduction,&rdquo; &ldquo;classification,&rdquo; etc.
</para>
</listitem>

<listitem>
<para>
<emphasis>Controller</emphasis>: reads Generator, Scanner and
Observer, and calculates setpoints for the Actuator,
in its &ldquo;<emphasis>control algorithm</emphasis>.&rdquo;
</para>
</listitem>

</itemizedlist>
These are the <emphasis>functional components</emphasis>, i.e., the
components of which application programmers see the plug-in interface,
and for which they must provide functional contents, in the form of
the signal generation or processing algorithms of their application.
</para>
<para>
When they are present, these functional components are
<emphasis>always</emphasis> connected according to the same structure,
depicted in <xref linkend="fig-core-comp-arch">. This figure shows the
functional components (and the infrastructural components discussed in
the following Section) as rectangular boxes. They interact through
Producer-Consumer <emphasis>mediators</emphasis>
(<xref linkend="sect-PC-mediator">), depicted by ovals.
</para>


<figure id="fig-core-comp-arch" float="1" pgwide="0">
<title>
 Structure of generic control application.
</title>
<mediaobject>
<imageobject>
<imagedata fileref="rthowtofigs/core-comp-arch.png" format="PNG">
</imageobject>
<imageobject>
<imagedata fileref="rthowtofigs/core-comp-arch.eps" format="EPS">
</imageobject>
</mediaobject>
</figure>

</sect1>


<sect1 id="des-control-infra">
<title>Infrastructural components</title>
<para>
The design also needs some <emphasis>infrastructural
components</emphasis>, that run &ldquo;behind the scenes&rdquo; in
order to support the functional components, but that don't execute any
application-specific algorithms. These components are:
<itemizedlist>

<listitem>
<para>
<emphasis>Execution Engine</emphasis>: is responsible for
<emphasis>activation</emphasis> and <emphasis>configuration</emphasis>:
  <itemizedlist>

  <listitem>
  <para>
activating the functional components, respecting their individual
timing specifications.
  </para>
  </listitem>

  <listitem>
  <para>
run-time configuration of the functional components.
  </para>
  </listitem>

  </itemizedlist>
This is the only component that knows how the other components should
interact, and it triggers other components to execute their
functionality. By localizing the
<emphasis>application logic</emphasis> in one
single component, the system is much easier to understand, program,
adapt, and make deterministic. The core of the Execution Engine is a
finite state machine, whose outputs are triggers for the other
components; for the pacing of its state machine, the Execution Engine
relies on basic timer functionality of the &rtos;.
</para>
</listitem>

<listitem>
<para>
<emphasis>Command Interpreter</emphasis>: this is
<emphasis>not</emphasis> a hard real-time
component, because it receives commands (configuration, action
specification, etc.) from <emphasis>user space</emphasis> (in whatever
<emphasis>protocol</emphasis> the application uses), parses them, checks their
consistency, fills in the configuration data structures for the other
components, and signals the Execution Engine when a complete and
consistent new specification for the real-time system is available.
It has to make sure that its communication with the real-time
Execution Engine is <emphasis>atomic</emphasis>: either the whole new
specification is transferred, or nothing.
&ldquo;<emphasis>Swinging buffers</emphasis>&rdquo;
(<xref linkend="sect-swinging-buf">)
are a possible &rtos; &ipc; primitive to implement this atomicity.
</para>
</listitem>

<listitem>
<para>
<emphasis>Reporter</emphasis>: collects the data that the other components
want to send to the user, and takes care of the transmission.
</para>
</listitem>

<listitem>
<para>
<emphasis>HeartBeat</emphasis>: this component handles the timer ticks
from the operating system, and derives a
&ldquo;virtual system time&rdquo; from it. The Execution Engine asks
the HeartBeat to activate some components (i.e., to fire appropriate
events) at particular virtual time instants.
</para>
</listitem>

</itemizedlist>
</para>

</sect1>


<sect1 id="sect-control-design">
<title>Design</title>
<para>
This Section explains the design decisions behind the structure of
<xref linkend="fig-core-comp-arch">.
</para>
<para>
One major design choice is to introduce
<emphasis>maximum decoupling</emphasis> between components.
This is achieved in various ways: 
<itemizedlist>

<listitem>
<para>
Dividing the whole application into components with a 
<emphasis>minimal amount of interactions.</emphasis> The whole system
has simple &ldquo;Producer-Consumer&rdquo; interactions, and the
interaction graph has <emphasis>no loops</emphasis>. 
Execution Engine, Generator, Observer and Controller
can be designed fully independently of &rtos; and user, because they
interact only with Scanner, Actuator, Reporter and HeartBeat.
</para>
</listitem>

<listitem>
<para>
Minimizing the <emphasis>&rtos; primitives</emphasis> that each
component needs.  The HeartBeat needs input from the timer of the
&rtos;, and the mediators need locks to sequence the access to the
interaction data they encapsulate. They need more only in case the
system is distributed over a network, by cutting a mediator in two.
</para>
</listitem>

<listitem>
<para>
<emphasis>Localizing</emphasis> each of the component-component
interactions into a mediator object.
</para>
</listitem>

<listitem>
<para>
<emphasis>Localizing</emphasis> the &rtos; interaction in the so-called
<emphasis>operating system interface</emphasis>.<indexterm>
<primary>operating system interface</primary></indexterm>
</para>
</listitem>

<listitem>
<para>
<emphasis>Localizing</emphasis> the hardware interaction in the so-called
<emphasis>device interface</emphasis>,<indexterm>
<primary>device interface</primary></indexterm>
so that it can be ported to other platforms, or to user space, running
on virtual hardware, used for example for simulation or non-real-time
signal processing.
</para>
</listitem>

<listitem>
<para>
Using events allows the system to not rely at all on the
<emphasis>scheduler</emphasis> of the &rtos; (see next Section).
</para>
</listitem>

</itemizedlist>
</para>
<para>
Another design choice is to provide a design that can be flexibly
configured, going from everything running as one single task, even
without an operating system, to a system where each component runs on
a separate processor. This design goal has been reached as follows:
<itemizedlist>

<listitem>
<para>
<emphasis>Events.</emphasis> This is one of the best decoupling
mechanisms to use at all possible levels of distribution
(<emphasis>if</emphasis> the application allows it,
<xref linkend="event-caution">). Events encompass hardware and
software interrupts, exceptions, state machine actions, CORBA events,
etc.
</para>
</listitem>

<listitem>
<para>
<emphasis>Mediators.</emphasis> Since all information about an
interaction is localized in these mediators, distributing the
mediators is all it takes to distribute the application. Everything
outside of the mediators remains unchanged.
</para>
</listitem>

</itemizedlist>
</para>

</sect1>


<sect1 id="sect-control-impl">
<title>Implementation</title>
<para>
A full control application may seem quite complex at first. But the
structure of the application, and the design decisions explained in
the previous Section, make a very simple and efficient implementation
possible on a single-processor system.
</para>
<para>
The key behind the implementation is that the
<emphasis>structure</emphasis> of the
application is a <emphasis>loopless graph</emphasis>. This means that
there is a deterministic way to
<emphasis>serialize</emphasis><indexterm>
<primary>serialization</primary></indexterm>
the whole execution of the control system. There are two natural
orders, <emphasis>push</emphasis>,<indexterm>
<primary>push</primary></indexterm>
and <emphasis>pull</emphasis><indexterm>
<primary>pull</primary></indexterm>. Push means that the execution
starts at the Scanner component, that reads the hardware, and produces
data for its mediator; that mediator then uses these &ldquo;inputs&rdquo;
to trigger the Observer, and the Generator. Then, the Controller works
and finally the Actuator. After they have done their job, the
Command Interpreter and Execution Engine are executed.
Pull is the same thing, in reverse order, starting from the Actuator.
</para>
<para>
All this &ldquo;execution&rdquo; is nothing else but running the event
handlers of the HeartBeat virtual tick event: all functionality of all
components is registered as listeners to that event. The order of the
execution of the listeners corresponds to the natural order in the
presented control design.
</para>
<para>
The serial execution above could also be executed in one single
combination of &isr;<indexterm>
<primary>&isr;</primary></indexterm>
and &dsr;,<indexterm>
<primary>&dsr;</primary></indexterm> 
(<xref linkend="sect-idsr">), where the &isr; is triggered by
the hardware timer of the system.  So, in principle, this
implementation doesn't even need an operating system, and is
appropriate for embedded implementations that require little
flexibility.
</para>
<para>
The fact that all execution can nicely be serialized deterministically
allows the use of <emphasis>unprotected objects</emphasis> in the
mediators (<xref linkend="prod-cons-data-access">), again improving
efficiency because no locking is needed.
</para>

</sect1>

</chapter>

</part>

<!-- =====================P=A=R=T==IV============================= -->

<part id="part4">
<title>Tips and tricks</title>

<partintro>
<para> 
This last Part is a collection of more or less unconnected tips and
tricks, that can help application programmers to solve many of those
little annoying problems that show up in a software project. Of
course, the emphasis is again on real-time and embedded applications.
</para> 
</partintro>


<chapter id="hints">
<title>Tips and tricks</title>
<para>
TODO: 
memory barriers;
exception handling: recover, hang up or crash, error recovery vs die on the
spot; time stamps; garbage collection vs fixed size
chunks vs static allocation;
</para>


<sect1 id="hints-tasks">
<title>Tasks</title>

<para>
The term &ldquo;<emphasis>thread pool<indexterm>
<primary>thread pool</primary></indexterm></emphasis>&rdquo;
is often used in the context of servers that have to process lots of
service requests coming in asynchronously from client tasks. The term
makes one think about a company that hires workers
when it needs them and gives them a job to do.
This &ldquo;active way of distributing jobs&rdquo;
is a rather unfortunate analogy to think of programs: you shouldn't be
thinking about &ldquo;giving the threads work to do&rdquo;, but
about &ldquo;announcing that there is work to do&rdquo;. The threads
will then pick up that work when they are ready.
The Producer-Consumer mediator model is the way to go here. A request
comes in, the producer puts it on a queue, and a
consumer takes it off that queue and processes it. Consumer
threads block when there is nothing to do, and they wake up and
work when jobs become available.
</para>
<para>
The thread pool example above is one of those many occasions where
programmers create a &ldquo;manager<indexterm>
<primary>manager</primary></indexterm>&rdquo; task: that manager takes
all the decisions, such as <emphasis>actively</emphasis> deciding when
a certain task has to start and stop. But trying to start and stop
threads from an external task is error prone
(<xref linkend="sect-posix-threads">). Trying to
<emphasis>delete</emphasis> another task is even more dangerous: there
is no way you can determine when another task is not involved anymore
in &ipc; with other tasks, or when it has released all of the locks it
holds on shared resources.
</para>
<para>
Determining a correct stack size for your tasks is often a difficult
job. If you have the possibility to experiment with your application
in realistic and worst-case environments, the following trick can help
you out:
<itemizedlist>

<listitem>
<para>
Allocate a quite large stack size for a task.
</para>
</listitem>

<listitem>
<para>
At creation of the task, fill the stack with a regular pattern, such
as &ldquo;123412341234&hellip;&rdquo;.
</para>
</listitem>

<listitem>
<para>
At the end of your test run, check how much of the regular pattern has
been overwritten. This is a <emphasis>lower bound</emphasis> for the
stack size this particular task should get.
</para>
</listitem>

</itemizedlist>
</para>

</sect1>


<sect1 id="hints-signals">
<title>Signals</title>
<para>
Signals and threads do not mix well. A lot of programmers start out by
writing their code under the mistaken assumption that they can set a
signal handler for each thread; but signals operate on the
<emphasis>process</emphasis>, i.e., all threads receive all signals.
One can block or unblock signals on a thread-by-thread basis, but this
is not the same thing.
</para>
<para>
However, in &linux; each thread is a process, and has its own signal
handling. &linux; executes signal handlers in the
<function>ret_from_intr</function> action (see
<filename>arch/xyz/kernel/entry.S</filename>, with
<filename>xyz</filename> the name of a particular &cpu; architecture).
</para>
<para>
If you have to deal with signals, the best you can do is to
create a special signal handling thread: its sole purpose is to handle
signals for the entire process. This thread should loop calling
<function>sigwait()</function>, and all threads (including the one
that calls <function>sigwait()</function>) block the signals you are interested in.
This allows your system to deal with signals synchronously.
</para>
<para>
Sending signals to other threads within your own
process is not a friendly thing to do, unless you are careful with
signal masks.
</para>
<para>
Using <function>sigwait()</function> and installing signal handlers
for the signals you are sigwaiting for is a bad idea: one signal will
generate two reactions in your application, and these reactions are
hard to synchronize.
</para>
<para>
Let threads sleep on time or condition variables only: this makes
their actions on wake-up deterministic. So avoid
<function>pthread_suspend_np()</function>
and
<function>pthread_wakeup_np()</function>. &posix; didn't include these
calls because they too easily lead to an inconsistent system, but
&unix98; has them.
</para>

</sect1>


<sect1 id="hint-condvar">
<title>Condition variables</title>
<para>
Don't mistake the (&posix;, <xref linkend="standards-posix">)
<emphasis>condition variable</emphasis> for a
<emphasis>logical condition</emphasis>: the condition variable acts
like a signal, in that it is only the
<emphasis>notification</emphasis> that some logical condition
<emphasis>might</emphasis> be changed. When coming out of the
blocking, the task should check the logical condition again, because
the signaling through the condition variable doesn't guarantee
anything about the value of the logical condition. 
Have a look at the example in <xref linkend="sect-condvar">.
</para>

</sect1>


<sect1 id="hint-lock">
<title>Locks</title>
<para>
Application programmers are responsible for acquiring and releasing
locks; they cannot expect much help from programming tools or from the
operating system to use locks efficiently and effectively. It is
indeed <emphasis>very</emphasis> difficult to interpret automatically
the <emphasis>purpose</emphasis> of a lock, i.e., locks are really
part of the <emphasis>semantics</emphasis> of a program, and much less of its
<emphasis>syntax</emphasis>. Moreover, locks work only when
<emphasis>all</emphasis> tasks that access the resource obey the
(non-enforceable) lock: any task can just decide not to check the lock and
access the resource, without the operating system or other tasks
being able to prevent it.
</para>
<para>
The programmer should think about the following when using locks:
<itemizedlist>
 <listitem>
 <para>
Make sure the sections protected by locks are as short as
possible, <emphasis>and</emphasis> remain buried in the operating
system code, or in <emphasis>objects</emphasis> (encapsulated data
types) in the application's support libraries or components.
 </para>
 </listitem>

 <listitem>
 <para>
Make sure interrupt routines do not share locks with non-interrupt
code. If this condition is not satisfied, the interrupt routine can
block on the lock, or the non-interrupt task that sets a lock can
never be sure that an interrupt routine will not enter its critical
section. Here is an example that leads to a deadlock:
<programlisting>
<![CDATA[
lock lock_A;
        ...
        // in task A:
        get_lock(lock_A);
        ...
                // Here, an interrupt routine comes in
                // and tries to get the same lock:
                get_lock(lock_A);
                ...
]]>

</programlisting>
 </para>
 </listitem>

 <listitem>
 <para>
Use locks only <emphasis>locally</emphasis> (i.e., in at most two
tasks, and without nesting) and <emphasis>focused</emphasis> (i.e.,
use one lock for one purpose only, and give it a relevant name).
Although this is not a strict requirement, violating it leads to
complex code, which is error-prone and difficult to maintain and
extend.
 </para>
 </listitem>

 <listitem>
 <para>
Place lock and protected data in the same data structure. They
<emphasis>really</emphasis> belong together, to form a
&ldquo;protected object&rdquo;.
 </para>
 </listitem>

 <listitem>
 <para>
If interrupt routines and kernel or user tasks share critical sections
(which they shouldn't!), the latter ones should <emphasis>disable
interrupts</emphasis> when entering the critical section.  Again, many
processors make this kind of combined operation available (test and
set lock, disable interrupts) in an atomic version. But be aware of
its cost!
 </para>
 </listitem>

 <listitem>
 <para>
Never use a recursive mutex with condition variables because the
implicit unlock performed for a
<function>pthread_cond_wait()</function> or
<function>pthread_cond_timedwait()</function> might not actually
release the mutex. In that case, no other thread can satisfy the
condition of the predicate.
 </para>
 </listitem>

</itemizedlist>
</para>

</sect1>

<sect1 id="hints-inter">
<title>Interrupts</title>

<para>
The correct place to call <function>request_irq()</function> is when
the device is first opened, before the hardware is instructed to
generate interrupts. The
place to call <function>free_irq()</function> is the last time the
device is closed, after the hardware is told not to interrupt the
processor any more. The disadvantage of this technique is that you
need to keep a per-device open count. Using the module count isn't
enough if you control two or more devices from the same module.
&hellip;
</para>

<para>
In some operating systems, interrupt code runs on the stack of
whatever task was running when the interrupt happened. This
complicates the programmer's job of choosing an appropriate stack size
for tasks.
</para>

</sect1>


<sect1 id="hints-memory">
<title>Memory</title>

<para>
Some peripheral devices use <emphasis>Direct Memory
Access</emphasis><indexterm>
<primary>Direct Memory Access</primary></indexterm>
(DMA)<indexterm>
<primary>DMA</primary></indexterm>
(<xref linkend="sect-shared-mem">).
Often, it's a practical problem to get enough
<emphasis>contiguous</emphasis> memory, e.g., the device expects to be
able to dump its data to 2 megabytes of RAM without
&ldquo;holes&rdquo; in the address range, while the operating system
doesn't have such a big chunk of physical RAM. 
</para>
<para>
One way to solve this problem is to set aside a part of the available
RAM at boot time, <xref linkend="sec-shared-mem-linuxI">. This means
that the operating system will not use that RAM for anything, such
that an application can use it. Of course, if your application has
several tasks that want to use this RAM, you have to do the memory
management yourself. As an example, the &linux; operating system
allows a boot parameter option as follows:
<programlisting>
linux mem=128M
</programlisting>
indicating that only 128 of the available megabytes will be used by
the operating system. Boot loaders, such as
<application>lilo</application> or 
<application>grub</application>, have similar configuration options.
</para>
<para>
Another approach is <emphasis>scatter/gather DMA</emphasis>:<indexterm>
<primary>scatter/gather DMA</primary></indexterm>
<indexterm>
 <primary>DMA</primary><secondary>scatter/gather</secondary>
</indexterm>
the operating system divides the physically non-contiguous DMA buffer
into a <emphasis>list</emphasis> with entries that contain (i) a
pointer to a physical page, and (ii) the amount of contiguous RAM
available at that place. Typically, all these physical pages have the
default size of your operating system, except probably the first and
the last. To initiate the DMA, you load the first pointer/size pair
from the list into the DMA controller, and program it to issue an
<emphasis>interrupt</emphasis> (<xref linkend="sect-inter-sw">) when
the DMA is done. Then, in the interrupt handler, you re-initiate the
DMA with the next pair of values from the list. This is repeated until
the list is exhausted.
</para>

</sect1>


<sect1 id="hints-design">
<title>Design</title>
<para>
Don't make use of platform-specific function calls or data structures: use
standards (e.g., &posix;), or encapsulate platform-specific code in libraries
with a neutral &api;. Difficult!
</para>

</sect1>


<sect1 id="hints-programming">
<title>Programming</title>
<para>
The <parameter>volatile</parameter> keyword is an important feature of
the &ccc; compiler for real-time and embedded systems. These systems
most often interact with peripheral hardware, via 
<emphasis>memory-mapped I/O<indexterm>
<primary>memory-mapped I/O</primary></indexterm></emphasis>.
<indexterm>
 <primary>I/O</primary><secondary>memory-mapped</secondary>
</indexterm>
That means that the hardware's registers are read from, or written to,
as if they were a couple of bytes in the normal RAM of the system.
Typically, some registers of the hardware are always read, some others
always written. And many peripheral devices use the same register for
subsequent reads or writes. Hence, the following code fragment is
typical for such an operation:
<programlisting>
<![CDATA[
char *writereg = 0xFF20000;
char byte1, byte2;

...
*writereg = byte1;
*writereg = byte2;
...
]]>
</programlisting>
Most compilers are doing lots of optimizations behind the scenes. And
they will &ldquo;optimize away&rdquo; the code above to one single
write, because their reasoning is that writing to the same variable
twice in a row amounts to the same thing as only writing the last
value; indeed, the first write is overwritten immediately. But this is
not what one wants when accessing peripheral hardware registers.
To prevent the compiler from optimizing away these multiple writes, one
should use the <parameter>volatile</parameter> qualifier in front of
the <parameter>writereg</parameter>.
</para>

<para>
<emphasis>Default initialization</emphasis>: the &ccc; standard says
that static integers are automatically initialized to zeros. This is
often used by programmers as an excuse not to initialize their
variables explicitly. The arguments being that (i) the implicit
initialization is guaranteed by standard &ccc;,
and (ii) explicit initialization requires a couple more bytes in the
binary. Nevertheless, explicit initialization does help other coders
to better understand your code. And remember: this implicit
initialization is nothing but a <emphasis>syntactic</emphasis> support
from the compiler, which may well lead to
<emphasis>semantic</emphasis> errors! For example, your code compiles
because a condition integer has gotten a value (zero), but the logic
of your application requires that it would have been initialized to
one.
</para>

<para>
For embedded applications, making the binaries of the loaded code as
small as possible is important. Normal compilation results in quite
some &ldquo;overhead&rdquo; in this respect, such as symbolic data,
etc. There exist various ways to make binary code smaller:
<command>strip</command>; using &ccc; or GUI libraries that are
especially designed for embedding; etc.
For example,
<ulink url="http://www.busybox.net">BusyBox</ulink> is a replacement
for most of the utilities one usually finds in the &gnu;
<filename>fileutils</filename>, <filename>shellutils</filename>, etc.;
<ulink url="http://www.uclibc.org">&mu;clibc</ulink> is a small version
of the general &ccc; library; (micro window toolkits&hellip;).
</para>

<para>
Modern &cpu;s can decide to &ldquo;optimize&rdquo; your code, by
changing the order of some statements. This means that
reads and writes can be done in different orders,
unless you take action to prevent it, such as a
<emphasis>memory barrier</emphasis>. (This is a
<emphasis>hardware</emphasis> barrier, which is different from the
software barrier in <xref linkend="sect-barrier">!) 
Operating systems do these barriers for you, in a number of
primitives, such as <link linkend="sect-mutex">mutex</link>,
<link linkend="sect-condvar">condition variable</link>, or 
<link linkend="sect-semaphore">semaphore</link>. The &posix;
specification has more to say about this
<ulink
url="http://www.opengroup.org/onlinepubs/007904975/basedefs/xbd_chap04.html#tag_04_10">
here
</ulink>.
</para>

</sect1>



</chapter>

</part>

<!-- ======================================================== -->

<bibliography>
<title>Bibliography</title>

<bibliodiv><title>URLs</title>

<biblioentry>
 <title>
<ulink url="http://www.tldp.org/HOWTO/KernelAnalysis-HOWTO.html">
 KernelAnalysis-HOWTO
</ulink>.
 </title>
</biblioentry>

<biblioentry>
 <title>
<ulink url="http://as400bks.rochester.ibm.com/pubs/html/as400/v5r1/ic2924/index.htm?info/apis/rzah4mst.htm">
 &posix; thread &api; concepts 
</ulink>.
 </title>
</biblioentry>

<biblioentry>
 <title>
<ulink url="http://www.opengroup.org/branding/prodstds/x98rt.htm">Product
Standard: Multi-Purpose Realtime Operating System</ulink>.
 </title>
</biblioentry>

<biblioentry>
 <title>
 The   <ulink url="http://cs-www.bu.edu/pub/ieee-rts/Home.html">IEEE
Computer Society Real-time Research Repository</ulink>.
 </title>
</biblioentry>

<biblioentry>
 <title>
 <ulink
   url="http://linux-embedded.org/howto/Embedded-Linux-Howto.html">
   Embedded Linux Howto
 </ulink>
 </title>
 <authorgroup>
  <author>
   <firstname>Sebastien</firstname>
   <surname>Huet</surname>
  </author>
 </authorgroup>
</biblioentry>

<biblioentry>
 <title>
 <ulink
   url="http://members.nbci.com/_XMCM/greyhams/linux/PowerPC-Embedded-HOWTO.html">
   Linux for PowerPC Embedded Systems HOWTO
 </ulink>
 </title>
 <authorgroup>
  <author>
   <firstname>Graham</firstname>
   <surname>Stoney</surname>
  </author>
 </authorgroup>
</biblioentry>

<biblioentry>
 <title>
 <ulink url="">Using Shared Memory in Real-Time Linux</ulink>.
 </title>
 <authorgroup>
  <author>
   <firstname>Frederick</firstname>
   <othername>M.</othername>
   <surname>Proctor</surname>
  </author>
 </authorgroup>
</biblioentry>

<biblioentry>
 <title>
   The <filename class="directory">Documentation</filename> directory of the
   &linux; kernel
 </title>
</biblioentry>

<biblioentry>
 <title>
 <ulink url="news:comp.realtime">comp.realtime</ulink>
 </title>
</biblioentry>

<biblioentry>
 <title>
 <ulink url="http://www.rtlinux.org">Real-time Linux mailinglist</ulink>
 </title>
</biblioentry>

<biblioentry>
 <title>
 <ulink url="http://linas.org/linux/threads-faq.html">LinuxThread FAQ</ulink>
 </title>
</biblioentry>

<biblioentry>
 <title>
 <ulink url="http://www.lambdacs.com/cpt/FAQ.html">comp.programming.threads
  FAQ
 </ulink>
 </title>
</biblioentry>

<biblioentry>
 <title>
<ulink
url="http://www.realtime-info.be/encyc/techno/publi/faq/rtfaq.htm">Real-time
FAQ</ulink>
 </title>
</biblioentry>

<biblioentry>
 <title>
<ulink url="http://www.rtlinux.org/rtlinux.new/documents/faq.html">Real-time
Linux FAQ</ulink>.
 </title>
</biblioentry>

<biblioentry xreflabel="Locke2002">
 <title>
<ulink url="http://www.linuxdevices.com/articles/AT5698775833.html">Priority
Inheritance: The Real Story</ulink>
 </title>
 <authorgroup>
  <author>
   <firstname>Doug</firstname><surname>Locke</surname>
  </author>
 </authorgroup>
</biblioentry>

<biblioentry xreflabel="Yodaiken2002">
 <title>
<ulink url="http://www.linuxdevices.com/files/misc/yodaiken-july02.pdf">Against
priority inheritance</ulink>
 </title>
 <authorgroup>
  <author>
   <firstname>Victor</firstname><surname>Yodaiken</surname>
  </author>
 </authorgroup>
</biblioentry>

<biblioentry>
 <title>
  <ulink url="http://www.ddj.com/documents/s=897/ddj9911b/9911b.htm">Linux,
Real-Time Linux, &amp; IPC</ulink>
 </title>
 <authorgroup>
  <author>
   <firstname>Frederick</firstname><othername>M.</othername>
   <surname>Proctor</surname>
  </author>
 </authorgroup>
</biblioentry>

<biblioentry>
 <title>
  <ulink
url="http://netfilter.kernelnotes.org/kernel-hacking-HOWTO/kernel-hacking-HOWTO.html">Linux Kernel Hacking HOWTO</ulink>
 </title>
 <authorgroup>
  <author>
   <firstname>Paul</firstname><othername>Rusty</othername>
   <surname>Russell</surname>
  </author>
 </authorgroup>
</biblioentry>

<biblioentry>
 <title>
  <ulink url="http://netfilter.kernelnotes.org/unreliable-guides/kernel-locking/lklockingguide.html">Linux Kernel Locking</ulink>
 </title>
 <authorgroup>
  <author>
   <firstname>Paul</firstname><othername>Rusty</othername>
   <surname>Russell</surname>
  </author>
 </authorgroup>
</biblioentry>

<biblioentry xreflabel="Hyde97">
 <title>
  <ulink
url="http://www.ladysharrow.ndirect.co.uk/library/Progamming/The%20Art%20of%20Assembly%20Language%20Programming/Chapter%2017.htm">Interrupts on the Intel 80x86</ulink>
 </title>
 <authorgroup> 
  <author>
  <firstname>Randall</firstname> <surname>Hyde</surname>
  </author>
 </authorgroup>
</biblioentry>

<biblioentry>
 <title>
  <ulink url="http://www.macraigor.com/zenofbdm.pdf">The ZEN of BDM</ulink>
 </title>
 <authorgroup> 
  <author>
  <firstname>Craig</firstname><othername>A.</othername>
   <surname>Haller</surname>
  </author>
 </authorgroup>
</biblioentry>
<biblioentry>
 <title>
<ulink url="http://ebus.mot-sps.com/ProdCat/psp/0,1250,68376~M98645,00.html">MC68376 manuals</ulink>
 </title>
 <authorgroup> 
  <author>
   <surname>Motorola</surname>
  </author>
 </authorgroup>
</biblioentry>

</bibliodiv>

<bibliodiv><title>Articles and books</title>

<biblioentry xreflabel="Arcomano2002">
  <title>
   <ulink
      url="http://www.tldp.org/HOWTO/KernelAnalysis-HOWTO.html">
      KernelAnalysis-HOWTO
   </ulink>
  </title>
  <authorgroup>
    <author>
      <firstname>Roberto</firstname> <surname>Arcomano</surname>
    </author>
  </authorgroup>
  <pubdate>2002</pubdate>
  <publisher> <publishername>The Linux Documentation Project</publishername> </publisher>
</biblioentry>

<biblioentry xreflabel="Barr99">
  <title>Programming embedded systems in &ccc; and
    &cpp;</title>
  <authorgroup>
    <author>
      <firstname>Michael</firstname> <surname>Barr</surname>
    </author>
  </authorgroup>
  <pubdate>1999</pubdate>
  <publisher> <publishername>O'Reilly</publishername> </publisher>
</biblioentry>

<biblioentry xreflabel="BrinchHansen73">
 <biblioset relation="article">
  <title>Concurrent Programming Concepts</title>
  <authorgroup>
    <author>
      <firstname>Per</firstname>
      <surname>Brinch Hansen</surname>
    </author>
  </authorgroup>
  <pagenums>223&ndash;245</pagenums>
  <volumenum>5</volumenum><issuenum>4</issuenum>
  <pubdate>1973</pubdate>
 </biblioset>
 <biblioset relation="journal">
  <title>ACM Computing Surveys</title>
 </biblioset>
</biblioentry>

<biblioentry xreflabel="BurnsWellings2001">
  <title>Real-time systems and Programming Languages</title>
  <edition>3</edition>
  <authorgroup>
    <author>
      <firstname>Alan</firstname> <surname>Burns</surname>
    </author>
    <author>
      <firstname>Andy</firstname> <surname>Wellings</surname>
    </author>
  </authorgroup>
  <pubdate>2001</pubdate>
  <publisher> <publishername>Addison-Wesley</publishername> </publisher>
</biblioentry>

<biblioentry xreflabel="posa96">
  <title>Pattern-oriented software architecture: a system of
         patterns
  </title>
  <authorgroup>
    <author>
      <firstname>Frank</firstname>
      <surname>Buschmann</surname>
    </author>
    <author>
      <firstname>Regine</firstname>
      <surname>Meunier</surname>
    </author>
    <author>
      <firstname>Hans</firstname>
      <surname>Rohnert</surname>
    </author>
  </authorgroup>
  <pubdate>1996</pubdate>
  <publisher> <publishername>Wiley, Chichester</publishername> </publisher>
</biblioentry>

<biblioentry xreflabel="Dijkstra65">
 <biblioset relation="article">
  <title>Cooperating sequential processes</title>
  <authorgroup>
    <author>
      <firstname>Edsger</firstname>
      <othername>Wybe</othername>
      <surname>Dijkstra</surname>
    </author>
  </authorgroup>
  <pagenums>43&ndash;112</pagenums>
  <pubdate>1968</pubdate>
 </biblioset>
 <biblioset relation="series">
    <title>Programming Languages</title>
    <editor><firstname>F.</firstname><surname>Genuys</surname></editor>
    <publisher>
      <publishername>Academic Press</publishername>
    </publisher>
  </biblioset>
</biblioentry>


<biblioentry xreflabel="gof94">
  <title>Design Patterns: Elements of Reusable Object-Oriented Software
  </title>
  <authorgroup>
    <author>
      <firstname>Erich</firstname>
      <surname>Gamma</surname>
    </author>
    <author>
      <firstname>Richard</firstname>
      <surname>Helm</surname>
    </author>
    <author>
      <firstname>Ralph</firstname>
      <surname>Johnson</surname>
    </author>
    <author>
      <firstname>John</firstname>
      <surname>Vlissides</surname>
    </author>
  </authorgroup>
  <pubdate>1994</pubdate>
  <publisher> <publishername>Addison Wesley</publishername> </publisher>
</biblioentry>

<biblioentry xreflabel="Galli2001">
 <biblioset relation="book">
  <title>Distributed Operating Systems</title>
  <authorgroup>
    <author>
      <firstname>Doreen</firstname>
      <othername role="mi">L.</othername>
      <surname>Galli</surname>
    </author>
  </authorgroup>
  <pubdate>2000</pubdate>
  <publisher>
     <publishername>Springer</publishername>
   </publisher>
  </biblioset>
</biblioentry>

<biblioentry xreflabel="Gray78">
 <biblioset relation="article">
  <title>Notes on database operating systems</title>
  <authorgroup>
    <author>
      <firstname>J.</firstname> <surname>Gray</surname>
    </author>
  </authorgroup>
  <pagenums>394&ndash;481</pagenums>
  <pubdate>1978</pubdate>
 </biblioset>
 <biblioset relation="series">
   <title>Operating systems: an advanced course</title>
   <authorgroup>
    <editor><firstname>R.</firstname><surname>Bayer</surname></editor>
    <editor><firstname>R.</firstname><surname>Graham</surname></editor>
    <editor><firstname>G.</firstname><surname>Seegmuller</surname></editor>
   </authorgroup>
   <publisher>
     <publishername>Springer</publishername>
   </publisher>
  </biblioset>
</biblioentry>

<biblioentry xreflabel="Herlihy91">
 <biblioset relation="article">
  <title>Wait-free Synchronization</title>
  <authorgroup>
    <author>
      <firstname>M.</firstname> <surname>Herlihy</surname>
    </author>
  </authorgroup>
  <pagenums>124&ndash;149</pagenums>
  <volumenum>13</volumenum><issuenum>1</issuenum>
  <pubdate>1991</pubdate>
 </biblioset>
 <biblioset relation="journal">
  <title>ACM Transactions on Programming Languages and Systems</title>
 </biblioset>
</biblioentry>

<biblioentry xreflabel="Herlihy93">
 <biblioset relation="article">
  <title>A Methodology for Implementing Highly Concurrent Data Objects</title>
  <authorgroup>
    <author>
      <firstname>M.</firstname> <surname>Herlihy</surname>
    </author>
  </authorgroup>
  <pagenums>745&ndash;770</pagenums>
  <volumenum>15</volumenum><issuenum>5</issuenum>
  <pubdate>1993</pubdate>
 </biblioset>
 <biblioset relation="journal">
  <title>ACM Transactions on Programming Languages and Systems</title>
 </biblioset>
</biblioentry>

<biblioentry xreflabel="Hoare74">
 <biblioset relation="article">
  <title>Monitors, an operating system structuring concept</title>
  <authorgroup>
    <author>
      <firstname>C.A.R.</firstname> <surname>Hoare</surname>
    </author>
  </authorgroup>
  <pagenums>549&ndash;557</pagenums>
  <pubdate>1974</pubdate>
  <volumenum>17</volumenum><issuenum>10</issuenum>
 </biblioset>
 <biblioset relation="journal">
  <title>Communications of the ACM</title>
 </biblioset>
</biblioentry>

<biblioentry xreflabel="Johnson97">
 <biblioset relation="article">
  <title>Frameworks = (components + patterns)</title>
  <authorgroup>
    <author>
      <firstname>R. E.</firstname>
      <surname>Johnson</surname>
    </author>
  </authorgroup>
  <pagenums>39&ndash;42</pagenums>
  <volumenum>40</volumenum><issuenum>10</issuenum>
  <pubdate>1997</pubdate>
 </biblioset>
 <biblioset relation="journal">
  <title>Communications of the ACM</title>
 </biblioset>
</biblioentry>

<biblioentry xreflabel="LampsonRedell80">
 <biblioset relation="article">
  <title>Experience with processes and monitors in Mesa</title>
  <authorgroup>
    <author>
      <firstname>Butler</firstname>
      <othername role="mi">W.</othername>
      <surname>Lampson</surname>
    </author>
    <author>
      <firstname>David</firstname>
      <othername role="mi">D.</othername>
      <surname>Redell</surname>
    </author>
  </authorgroup>
  <pagenums>105&ndash;117</pagenums>
  <pubdate>1980</pubdate>
  <volumenum>23</volumenum><issuenum>2</issuenum>
 </biblioset>
 <biblioset relation="journal">
  <title>Communications of the ACM</title>
 </biblioset>
</biblioentry>

<biblioentry xreflabel="Mealy55">
 <biblioset relation="article">
  <title>A method for synthesizing sequential circuits</title>
  <authorgroup>
    <author>
      <firstname>G. H.</firstname>
      <surname>Mealy</surname>
    </author>
  </authorgroup>
  <pagenums>1045&ndash;1079</pagenums>
  <pubdate>1955</pubdate>
  <volumenum>34</volumenum><issuenum>5</issuenum>
 </biblioset>
 <biblioset relation="journal">
  <title>Bell System Technical Journal</title>
 </biblioset>
</biblioentry>

<biblioentry xreflabel="Lewine91">
  <title>POSIX Programmer's Guide: Writing Portable UNIX Programs</title>
  <authorgroup>
    <author>
      <firstname>Donald</firstname> <surname>Lewine</surname>
    </author>
  </authorgroup>
  <pubdate>1991</pubdate>
  <publisher> <publishername>O'Reilly</publishername> </publisher>
</biblioentry>

<biblioentry xreflabel="Nutt2000">
  <title>Operating systems: a modern perspective</title>
  <authorgroup>
    <author>
      <firstname>Gary J.</firstname> <surname>Nutt</surname>
    </author>
  </authorgroup>
  <pubdate>2000</pubdate>
  <publisher> <publishername>Addison-Wesley</publishername> </publisher>
</biblioentry>


<biblioentry xreflabel="Rubini2001">
  <title><ulink url="http://www.oreilly.com/catalog/linuxdrive2/">Linux Device
    Drivers</ulink></title>
  <edition>2</edition>
  <authorgroup>
    <author>
      <firstname>Alessandro</firstname>
      <surname>Rubini</surname>
    </author>
    <author>
      <firstname>Jonathan</firstname>
      <surname>Corbet</surname>
    </author>
  </authorgroup>
  <pubdate>2001</pubdate>
  <publisher> <publishername>O'Reilly</publishername> </publisher>
</biblioentry>

<biblioentry xreflabel="Sakamura98">
  <title>&mu;ITRON 3.0</title>
  <subtitle>An Open and Portable Real-Time Operating System
  for Embedded Systems</subtitle>
  <authorgroup>
    <author>
      <firstname>Ken</firstname> <surname>Sakamura</surname>
    </author>
  </authorgroup>
  <pubdate>1998</pubdate>
  <publisher>
   <publishername>IEEE Computer Society</publishername>
   </publisher>
</biblioentry>

<biblioentry xreflabel="Simon99">
  <title>An Embedded Software Primer</title>
  <authorgroup>
    <author>
      <firstname>David</firstname><othername>E.</othername>
      <surname>Simon</surname>
    </author>
  </authorgroup>
  <pubdate>1999</pubdate>
  <publisher> <publishername>Addison-Wesley</publishername> </publisher>
</biblioentry>

<biblioentry xreflabel="Stevens99">
  <title>UNIX Network Programming. Interprocess Communications</title>
  <authorgroup>
    <author>
      <firstname>W.</firstname>
      <othername>Richard</othername>
      <surname>Stevens</surname>
    </author>
  </authorgroup>
  <pubdate>1999</pubdate>
  <publisher> <publishername>Prentice-Hall</publishername> </publisher>
 <bibliomisc>
   <ulink url="http://www.kohala.com/start/unpv22e/unpv22e.html">
    http://www.kohala.com/start/unpv22e/unpv22e.html
   </ulink>
 </bibliomisc>
</biblioentry>

<biblioentry xreflabel="Walmsley2000">
 <biblioset relation="book">
  <title>Multi-threaded programming in &cpp;</title>
  <authorgroup>
    <author>
      <firstname>Mark</firstname> <surname>Walmsley</surname>
    </author>
  </authorgroup>
  <pubdate>2000</pubdate>
  <publisher> <publishername>Springer</publishername> </publisher>
  </biblioset>
</biblioentry>

</bibliodiv>
</bibliography>

<!--  &index; -->

</book>


