% arabdoc5.tex %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section {Input coding conventions}\label {coding}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

The ASCII input notation for Arabic text has been modelled closely after 
the transliteration standards ISO/R 233 and DIN 31 635.
As these standards do not guarantee unique re-transliteration
and are also not 7-bit ASCII compatible, some modifications were necessary.
These follow the general rules:
                                                  \index{transliteration}
                                                  \index{coding conventions}

\begin{itemize}

\item whenever the transliteration uses a single letter, code that letter;

\item whenever the transliteration uses a letter with a diacritical mark,
  put the punctuation character most closely resembling the diacritical mark
  {\em before} the letter
  (and {\em not\/} after it as in some other coding proposals,
  as otherwise the readability of the input would suffer).

\item use capital letters for writing variants.

\end{itemize}

\subsection {Standard Arabic and Persian characters}
%---------------------------------------

\begin{table}[ht]
\begin{center}
\setarab \novocalize
\begin{tabular}{|r|c|c|c||r|c|c|c||r|c|c|c|}
\hline \astrut
\atabii a \alif & \atabii b \bah & \atabii p \pah \\
\hline \astrut
\atabii t \tah & \atabii _t \thah & \atabii ^g \gim \\
\hline \astrut
\atabii .h \hhah & \atabii _h \khah & \atabii d \dal \\
\hline \astrut
\atabii _d \dhal & \atabii r \rah & \atabii z \zay \\
\hline \astrut
\atabii s \sin & \atabii ^s \shin & \atabii .s \sad \\
\hline \astrut
\atabii .d \dad & \atabii .t \ttah & \atabii .z \zzah \\
\hline \astrut
\atabii ` \ain & \atabii .g \ghain & \atabii f \fah \\
\hline \astrut
\atabii q \qaf & \atabii v \vah & \atabii k \kaf \\
\hline \astrut
\atabii g \gaf & \atabii l \lam & \atabii m \mim \\
\hline \astrut
\atabii n \nun & \atabii h \hah & \atabii w \waw \\
\hline \astrut
\atabii y \yah & \atabii _A \alif & \atabii T \tah \\
&&&&&&& \maqsura &&&& \marbuta \\
\hline
\end{tabular}
\end{center}
\caption{Standard codings for Arabic and Persian.}
\label{standard}
\end{table}

\begin{table}[htb]
\begin{center}
\setarab \novocalize
\begin{tabular}{|r|c|c|l|}
\hline    \astrut
\atabii c       {\hhah\ with \hamza}
\\ \hline \astrut
\atabii ^c      {\gim\ with three dots (below)}
\\ \hline \astrut
\atabii ,c      {\khah\ with three dots (above)}
\\ \hline \astrut
\atabii ^z      {\zay\ with three dots (above)}
\\ \hline \astrut
\atabii ~n      {\kaf\ with three dots (Ottoman)}
\\ \hline \astrut
\atabii ~l      {\lam\ with a bow accent (Kurdish)}
\\ \hline \astrut
\atabii .r      {\rah\ with two bows (Kurdish)}
\\ \hline
\end{tabular}
\end{center}
\caption{Additional codings generally available.}
\label{extended}
\end{table}

The standard codings for Arabic and Persian are given
in Table~\ref{standard} and Table~\ref{extended}.

\begin{itemize}
\item For long vowels, use the capital letters <A>, <I>, <U>
  or <aa>, <iy>, <uw>.%
                                                  \index{long vowels}
                                                  \index{vowels!long}
                                                  \index{A}
                                                  \index{I}
                                                  \index{U}
                                                  \index{aa}
                                                  \index{iy}
                                                  \index{uw}
\item To get the defective writing of long vowels, use <_a>, <_i>, <_u>.
                                                  \index{defective writing}
                                                  \index{.\alif!dagger}
                                                  \index{dagger \alif}
                                                  \index{_a}
                                                  \index{_i}
                                                  \index{_u}
\item {\em 'Alif} \maqsura\ is <_A> or~<Y>.
                                                  \index{.\alif!.\maqsura}
                                                  \index{Y}
                                                  \index{_A}
\item The short vowels \fatha, \kasra, \damma\ are coded <a>, <i>, <u>
and need not normally be written except in the following cases:
                                                  \index{short vowels}
                                                  \index{a (\fatha)}
                                                  \index{i (\kasra)}
                                                  \index{u (\damma)}
                                                  \index{.\fatha}
                                                  \index{.\kasra}
                                                  \index{.\damma}
                                                  \index{vowels!short}
                                                  \index{.\harakat}
\begin{itemize}
  \item at the beginning of a word where they generate \alif,
  \item adjacent to \hamza\ where they will influence its carrier,
                                                  \index{.\hamza}
  \item when the transliteration is required,
  \item in the \verb+\fullvocalize+ mode.
                                                  \index{\fullvocalize}
\end{itemize}

\item {\em Tanw\={\i}n} is coded <aN>, <iN>, or~<uN>.
  A silent \alif, if required, is supplied automatically;
  it may also be explicitly written: <aNA>.
  Likewise, a silent \waw\ may be written <NU>
  as in <`amruNU>.
                                                  \index{.\tanween}
                                                  \index{aN}
                                                  \index{iN}
                                                  \index{uN}
                                                  \index{aNA}
                                                  \index{NU}

\item \hamza\ is denoted by a single {\em right\/} quote <'>.
                                                  \index{' (\hamza)}
                                                  \index{.\hamza}
                                                  \index{\setarab}
  After selecting a language by \verb"\setarab" etc.,
  the \hamza\ carrier will be determined from the context 
  according to the rules for writing Arabic words;
  if that is not wanted, ``quote'' 
  the \hamza\ (see Section~\ref {quote}\ below).
                                                  \index{quoting}
  In the \verb"\setverb" mode,
  the \hamza\ carrier is determined by the following letter;
  see Section~\ref {verbatim}.

\item \madda\ on \alif\ is generated by a right quote
  (\hamza) before <A>: <'A>. 
%  \madda\ on \alif\ 
  It may also be written <~A>; 
  likewise, <~I> and <~U> will produce \madda\ on \yah\ 
  and on \waw, as required in some older writing conventions.
                                                  \index{'A}
                                                  \index{~A}
                                                  \index{~I}
                                                  \index{~U}
                                                  \index{.\madda}
\item The coding <`> for \ain\ is a single {\em left\/} quote;
  beware of confusing it with \hamza!
                                                  \index{` (\ain)}
                                                  \index{.\ain}
\item The ``invisible consonant'' <|> may be inserted in order to
  break unwanted ligatures and to influence the \hamza\ writing.
  It will not show in the Arabic output or in the transliteration.
  At the beginning of a word it will suppress a following short vowel;
  otherwise it acts like a consonant.
                                                  \index{"|}
                                                  \index{invisible consonant}
                                                  \index{ligature!breaking}
\item The sequence <||> will insert a small space, as does <"|>
  (see Section~\ref {quote}\ below).
  The adjacent characters will not be connected.
                                                  \index{"|"|}
                                                  \index{"""|}
\item {\em \v{S}adda\/} is indicated by doubling the appropriate letter coding.
                                                  \index{.\tashdid}
                                                  \index{.\shadda}
\item The definite article is separated from the following word by a hyphen.
  It may be written in the assimilated form (if it exists): <as-salaamu>,
  or always as <al->;
  in that case a subsequent ``sun letter'' must be doubled: <al-ssalaamu>,
  to receive a \shadda,
  and to prevent a \sukun\ on the~\lam.
  The transliteration in both cases is identical.
                                                  \index{al-}
                                                  \index{definite article}
                                                  \index{assimilation}
                                                  \index{sun letter}
                                                  \index{.\shadda}
                                                  \index{.\sukun!on \lam}
\item Hyphens <-> are used for tying words together, or for indicating
  a connecting vowel in Arabic, or an \izafet\ connection in Persian.
  They may be used freely, and generally do not change the writing,
  but will show up in the transliteration.
  Additionally, at the beginning and the end of an otherwise isolated word
  they enforce the use of the connecting form of the adjacent letter
  (if it exists), e.g.\ in the date <1400 h->.
                                                  \index{hyphen}
                                                  \index{.\izafet}
                                                  \index{h-}
                                                  \index{date}
\item A double hyphen <--> between two otherwise joining letters
  will break any ligature and will insert a horizontal stroke
  (\tatweel, \kashida) without appearing in the transliteration.
  It may be used repeatedly.
  See also Section~\ref{misc}: automatic stretching.
                                                  \index{--}
                                                  \index{.\tatweel}
                                                  \index{.\kashida}
                                                  \index{stretching}
                                                  \index{ligature!breaking}

  For special applications, it can also be coded <B>; and <|B> will
  behave like an ordinary consonant and may carry vowel indicators,
  \tanween, \sukun, and, in the combination <|BB>: \shadda.
                                                  \index{B}
                                                  \index{"|B}
                                                  \index{"|BB}
                                                  \index{.\harakat}
                                                  \index{.\sukun!on \tatweel}
                                                  \index{.\tanween!on \tatweel}
                                                  \index{.\shadda!on \tatweel}
                                                  \index{.\harakat!on \tatweel}

\end{itemize}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection {Quoting}\label {quote}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

In \verb"\novocalize" mode
(see Section~\ref {vocalization}),
a double quote <"> will modify the meaning
of the following character as follows:
                                                  \index{"" (quoting)}
                                                  \index{quoting}
                                                  \index{\novocalize}

\begin{itemize}

\item if a short vowel follows,
  the appropriate diacritical mark \fatha, \kasra, \damma\
  will be put on the preceding character.
                                                  \index{.\fatha}
                                                  \index{.\kasra}
                                                  \index{.\damma}
                                                  \index{.\harakat}
\begin{itemize}
\item If <N> follows the short vowel,
  the appropriate form of \tanween\ will be generated instead.
                                                  \index{N}
                                                  \index{.\tanween}
\item At the beginning of a word,
  \alif\ is assumed as the first character.

%\item If the preceding word ended with a vowel,
%  \wasla\ is generated instead of the vowel indicator.
%  If that is not wanted, append <|> to the preceding word.
\end{itemize}

\item if the following character is a single right quote,
  a \hamza\ mark will be put on the preceding character
  even if in conflict with the \hamza\ rules.

  At the beginning of a word, an isolated \hamza\ will be generated.
                                                  \index{.\hamza}
\item if the following character is the ``invisible consonant'' <|>,
  the connection between the adjacent letters will be broken
  and a small space inserted. This can also be denoted <||> instead
  of~<"|>.

  At the beginning of a word, \alif\ with \wasla\
  will be generated.
                                                  \index{"|}
                                                  \index{breaking connections}
                                                  \index{"|"|}
                                                  \index{"""|}
                                                  \index{.\wasla}
\item otherwise: a \sukun\ will be put on the preceding character.
  The following character will be processed again.
                                                  \index{.\sukun}

\end{itemize}

The double quote will not show up in the transliteration.

In \verb"\vocalize" mode
(see Section~\ref {vocalization}),
quoting will turn a short vowel off;
likewise, in \verb"\fullvocalize" mode, quoting will also turn
a \sukun\ off.
Put differently: quoting will toggle the generation
of short vowel indicators and \sukun\ on and off.
                                                  \index{\vocalize}
                                                  \index{\fullvocalize}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection {Ligatures}\label {ligatures}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

There is no way to explicitly enforce ligatures
as a large number of them are generated automatically.
The results will not always look satisfactory, so we recommend 
inspecting the output after the first run.
Any unwanted ligature can be suppressed by interposing the invisible
character <|> between the two letters otherwise combined into a ligature.
After \verb"\ligsfalse", in the middle of a word fewer ligatures
will be produced; for some texts this looks better.
You can return to the normal strategy by \verb"\ligstrue".
                                                  \index{ligature}
                                                  \index{ligature!breaking}
                                                  \index{"|}
                                                  \index{\ligsfalse}
                                                  \index{\ligstrue}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection {Vowelization}\label {vocalization}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

There are three modes of rendering short vowels:

\begin{itemize}
\item \verb+\fullvocalize+:
                                                  \index{\fullvocalize}
\begin{itemize}
  \item Every short vowel written will generate the corresponding 
  diacritical mark \fatha, \kasra, \damma, except if quoted.
                                                  \index{diacritics}
                                                  \index{vowel marks}
                                                  \index{.\harakat}
                                                  \index{.\fatha}
                                                  \index{.\kasra}
                                                  \index{.\damma}
  \item If <N> follows a short vowel,
    the corresponding form of \tanween\ is generated instead.
                                                  \index{N}
                                                  \index{.\tanween}
  \item Defective writing:
    The coding <_a> will produce a {\em Qur'an} \alif\ accent
    (also called {\em dagger 'alif\/}) instead of an explicit \alif\ 
    character which would be coded <A> or~<aa>.
    Likewise, <_i> will produce a small \alif\ below the preceding
    consonant in place of <I>~(<iy>), 
    and~<_u> will produce an inverted \damma\
    in place of <U>~(<uw>).
                                                  \index{_a}
                                                  \index{Qur'an \alif}
                                                  \index{.\alif!Qur'an}
                                                  \index{.\alif!dagger}
                                                  \index{.\alif!small}
                                                  \index{.\alif!small!below}
                                                  \index{.\damma!inverted}
                                                  \index{inverted \damma}
                                                  \index{dagger \alif}
                                                  \index{_i}
                                                  \index{_u}
                                                  \index{defective writing}
  \item If a long vowel follows a consonant, the corresponding short vowel 
    is implied. The long vowel itself carries no diacritical mark.
                                                  \index{long vowels}
                                                  \index{vowels!long}
  \item If no vowel is given after a consonant, \sukun\ will be generated
    except if a double quote precedes the next consonant.
    The \lam\ of the definite article receives no \sukun\
    if a double ``sun letter'' follows.
                                                  \index{.\sukun}
                                                  \index{assimilation}
  \item \alif\ at the beginning of a word carries \wasla\
    instead of the vowel indicator if the preceding word ended with a vowel.
                                                  \index{.\wasla}
\end{itemize}

\item \verb+\vocalize+:
  As above, but \sukun\ and \wasla\ will not be generated
  except if explicitly indicated by ``quoting''.
                                                  \index{\vocalize}
                                                  \index{.\sukun}
                                                  \index{.\wasla}
                                                  \index{quoting}
\item \verb+\novocalize+:
  No diacritics will be generated
  except if explicitly asked for by ``quoting''.
                                                  \index{\novocalize}
                                                  \index{diacritics}
                                                  \index{.\harakat}
                                                  \index{quoting}

\end{itemize}

In all modes, a double consonant will generate \shadda,
and <'A> always generates \madda\ on~\alif.
                                                  \index{.\shadda}
                                                  \index{.\tashdid}
                                                  \index{'A}
                                                  \index{.\madda}

After <aN> the silent \alif\ character is generated if necessary.
The silent \alif\ may also be explicitly indicated by~<aNA>,
or coded literally as~<A> in \verb+\novocalize+ mode.
If a silent \alif\ \maqsura\ is wanted instead,
write <aN_A>, <aNY>, <_A> or~<Y>.
                                                  \index{.\tanween}
                                                  \index{aN}
                                                  \index{aNA}
                                                  \index{silent \alif}
                                                  \index{.\alif!silent}
                                                  \index{\novocalize}
                                                  \index{aN_A}
                                                  \index{_A}
                                                  \index{.\alif!.\maqsura}
                                                  \index{.\alif!.\maqsura!silent}

The \tanween\ \fatha\ is normally put on the last consonant of the word,
even if a silent \alif\ follows. If it is instead supposed to go
onto the \alif\ as in some modern Arabic conventions, or in Persian,
this behaviour can be achieved by the option \verb"\newtanwin".
The option \verb"\oldtanwin" will restore the classical behaviour.%
                                                  \index{.\tanween}
                                                  \index{\newtanwin}
                                                  \index{\oldtanwin}

A silent \alif\ after \waw\ is indicated by <UA>
or~<WA> (with a capital~<W>!).
                                                  \index{UA}
                                                  \index{WA}
                                                  \index{silent \alif}
                                                  \index{.\alif!silent}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection {Verbatim input}\label{verbatim}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{table}[htb]
\begin{center}
\setverb \novocalize
\begin{tabular}{|r|c|l||r|c|l|}
\hline \astrut
\atabl 'a 'a {\hamza\ on \alif} & \atabl 'i 'i {\hamza\ below \alif} \\
\hline \astrut
\atabl 'w 'w {\hamza\ on \waw} & \atabl 'y 'y- {\hamza\ on a tooth} \\
\hline \astrut
\atabl 'h 'h {\hamza\ on \hah} & \atabl 'B 'B {\hamza\ on the line} \\
\hline \astrut
\atabl '| '| {isolated \hamza} & \atabl 'A 'A {\madda\ on \alif} \\
\hline
\end{tabular}
\end{center}
\caption{Verbatim codings for the carrier of \hamza.}
\label{verbhamza}
\end{table}

After disabling language specific processing by \verb"\setverb"
or \verb"\setnone", \ArabTeX\ will not use any context information
to determine the carrier of \hamza. Instead the user has to
supply this information himself by the next character typed after <'>.
Generally this character will be used as the carrier; for examples
and some exceptions see Table~\ref{verbhamza}.
A short vowel indicator may follow.
                                                  \index{verbatim}
                                                  \index{.\hamza!carrier}

To ease automatic conversion, an initial \alif\ may also be coded <A>.
                                                  \index{A}
                                                  \index{.\alif}
                                                  \index{.\alif!initial}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection {Alternate input codings}\label {altinput}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

The \ArabTeX\ input notation has been very carefully designed for
flexibility, readability, and ease of use for linguists confined to
standard 7-bit ASCII equipment for processing and transmitting data.
However, it does not make much sense to recode existing machine-readable
text files coded according to other standards.
Thus, some alternate reading modules have been written
(as there are more than 10 different codings in current use,
this is an open-ended activity),
and a general code switching procedure has been provided.
                                                  \index{reading module}
                                                  \index{input switching}

An alternate reading module, e.g.\ {\tt asmo449.sty} for the ASMO~449 code,
is installed by adding its name ({\tt asmo449}) as a \LaTeX\ style option,
or by \verb"\input asmo449.sty".
Afterwards, a {\em code\_name\/} 
(in this case \verb"asmo449") is defined.
                                                  \index{ASMO 449}
                                                  \index{option!asmo449}
                                                  \index{option!iso88596}
                                                  \index{code!ASMO 449}
                                                  \index{code!arabtex}
                                                  \index{code!ISO 8859-6}
                                                  \index{code!ISO 9036}

Input coding is switched by the command
\verb"\setcode {"{\em code\_name\/}\verb"}"
that changes the coding for {\em Arabic text} globally,
or by the environment \verb"\begin {setcode}{"{\em code\_name\/}\verb"}"
$\cdots$ \verb"\end {setcode}" 
which follows the normal \TeX\ grouping rules.
                                                  \index{\setcode}
                                                  \index{\begin{setcode}}
                                                  \index{\end{setcode}}
                                                  \index{environment!setcode}

Coding may be switched several times in the same document, provided the
appropriate reading modules are installed;
\verb"\setcode {arabtex}" reverts to the standard \ArabTeX\ notation.
                                                  \index{\setcode{arabtex}}

Please observe that {\em only Arabic text\/} is affected by
\verb"\setcode {"{\em code\_name\/}\verb"}";
text outside of {\em Arabic contexts,}
and control sequence names, are still assumed to be in 7-bit ASCII.
As existing text files presumably do not contain
any control sequences or non-Arabic text anyway,
we suggest using a small ASCII \TeX/\LaTeX\ driver file setting all
relevant options and containing any non-Arabic text,
and calling the Arabic text files
by \verb"\input {"{\em file\_name\/}\verb"}"
from within an {\em Arabic environment\/}.
                                                  \index{\input}
                                                  \index{environment!arabtext}
                                                  \index{environment!Arabic}

For details on available additional reading modules, 
see Appendix~\ref {scanners}.

\endinput
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

