.\" Current File : //usr/local/apps/perl/man/man3/utf8.3
.\" Automatically generated by Pod::Man 4.11 (Pod::Simple 3.35)
.\"
.\" Standard preamble:
.\" ========================================================================
.de Sp \" Vertical space (when we can't use .PP)
.if t .sp .5v
.if n .sp
..
.de Vb \" Begin verbatim text
.ft CW
.nf
.ne \\$1
..
.de Ve \" End verbatim text
.ft R
.fi
..
.\" Set up some character translations and predefined strings.  \*(-- will
.\" give an unbreakable dash, \*(PI will give pi, \*(L" will give a left
.\" double quote, and \*(R" will give a right double quote.  \*(C+ will
.\" give a nicer C++.  Capital omega is used to do unbreakable dashes and
.\" therefore won't be available.  \*(C` and \*(C' expand to `' in nroff,
.\" nothing in troff, for use with C<>.
.tr \(*W-
.ds C+ C\v'-.1v'\h'-1p'\s-2+\h'-1p'+\s0\v'.1v'\h'-1p'
.ie n \{\
.    ds -- \(*W-
.    ds PI pi
.    if (\n(.H=4u)&(1m=24u) .ds -- \(*W\h'-12u'\(*W\h'-12u'-\" diablo 10 pitch
.    if (\n(.H=4u)&(1m=20u) .ds -- \(*W\h'-12u'\(*W\h'-8u'-\"  diablo 12 pitch
.    ds L" ""
.    ds R" ""
.    ds C` ""
.    ds C' ""
'br\}
.el\{\
.    ds -- \|\(em\|
.    ds PI \(*p
.    ds L" ``
.    ds R" ''
.    ds C`
.    ds C'
'br\}
.\"
.\" Escape single quotes in literal strings from groff's Unicode transform.
.ie \n(.g .ds Aq \(aq
.el       .ds Aq '
.\"
.\" If the F register is >0, we'll generate index entries on stderr for
.\" titles (.TH), headers (.SH), subsections (.SS), items (.Ip), and index
.\" entries marked with X<> in POD.  Of course, you'll have to process the
.\" output yourself in some meaningful fashion.
.\"
.\" Avoid warning from groff about undefined register 'F'.
.de IX
..
.nr rF 0
.if \n(.g .if rF .nr rF 1
.if (\n(rF:(\n(.g==0)) \{\
.    if \nF \{\
.        de IX
.        tm Index:\\$1\t\\n%\t"\\$2"
..
.        if !\nF==2 \{\
.            nr % 0
.            nr F 2
.        \}
.    \}
.\}
.rr rF
.\"
.\" Accent mark definitions (@(#)ms.acc 1.5 88/02/08 SMI; from UCB 4.2).
.\" Fear.  Run.  Save yourself.  No user-serviceable parts.
.    \" fudge factors for nroff and troff
.if n \{\
.    ds #H 0
.    ds #V .8m
.    ds #F .3m
.    ds #[ \f1
.    ds #] \fP
.\}
.if t \{\
.    ds #H ((1u-(\\\\n(.fu%2u))*.13m)
.    ds #V .6m
.    ds #F 0
.    ds #[ \&
.    ds #] \&
.\}
.    \" simple accents for nroff and troff
.if n \{\
.    ds ' \&
.    ds ` \&
.    ds ^ \&
.    ds , \&
.    ds ~ ~
.    ds /
.\}
.if t \{\
.    ds ' \\k:\h'-(\\n(.wu*8/10-\*(#H)'\'\h"|\\n:u"
.    ds ` \\k:\h'-(\\n(.wu*8/10-\*(#H)'\`\h'|\\n:u'
.    ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'^\h'|\\n:u'
.    ds , \\k:\h'-(\\n(.wu*8/10)',\h'|\\n:u'
.    ds ~ \\k:\h'-(\\n(.wu-\*(#H-.1m)'~\h'|\\n:u'
.    ds / \\k:\h'-(\\n(.wu*8/10-\*(#H)'\z\(sl\h'|\\n:u'
.\}
.    \" troff and (daisy-wheel) nroff accents
.ds : \\k:\h'-(\\n(.wu*8/10-\*(#H+.1m+\*(#F)'\v'-\*(#V'\z.\h'.2m+\*(#F'.\h'|\\n:u'\v'\*(#V'
.ds 8 \h'\*(#H'\(*b\h'-\*(#H'
.ds o \\k:\h'-(\\n(.wu+\w'\(de'u-\*(#H)/2u'\v'-.3n'\*(#[\z\(de\v'.3n'\h'|\\n:u'\*(#]
.ds d- \h'\*(#H'\(pd\h'-\w'~'u'\v'-.25m'\f2\(hy\fP\v'.25m'\h'-\*(#H'
.ds D- D\\k:\h'-\w'D'u'\v'-.11m'\z\(hy\v'.11m'\h'|\\n:u'
.ds th \*(#[\v'.3m'\s+1I\s-1\v'-.3m'\h'-(\w'I'u*2/3)'\s-1o\s+1\*(#]
.ds Th \*(#[\s+2I\s-2\h'-\w'I'u*3/5'\v'-.3m'o\v'.3m'\*(#]
.ds ae a\h'-(\w'a'u*4/10)'e
.ds Ae A\h'-(\w'A'u*4/10)'E
.    \" corrections for vroff
.if v .ds ~ \\k:\h'-(\\n(.wu*9/10-\*(#H)'\s-2\u~\d\s+2\h'|\\n:u'
.if v .ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'\v'-.4m'^\v'.4m'\h'|\\n:u'
.    \" for low resolution devices (crt and lpr)
.if \n(.H>23 .if \n(.V>19 \
\{\
.    ds : e
.    ds 8 ss
.    ds o a
.    ds d- d\h'-1'\(ga
.    ds D- D\h'-1'\(hy
.    ds th \o'bp'
.    ds Th \o'LP'
.    ds ae ae
.    ds Ae AE
.\}
.rm #[ #] #H #V #F C
.\" ========================================================================
.\"
.IX Title "utf8 3"
.TH utf8 3 "2019-05-11" "perl v5.30.0" "Perl Programmers Reference Guide"
.\" For nroff, turn off justification.  Always turn off hyphenation; it makes
.\" way too many mistakes in technical documents.
.if n .ad l
.nh
.SH "NAME"
utf8 \- Perl pragma to enable/disable UTF\-8 (or UTF\-EBCDIC) in source code
.SH "SYNOPSIS"
.IX Header "SYNOPSIS"
.Vb 2
\& use utf8;
\& no utf8;
\&
\& # Convert the internal representation of a Perl scalar to/from UTF\-8.
\&
\& $num_octets = utf8::upgrade($string);
\& $success    = utf8::downgrade($string[, $fail_ok]);
\&
\& # Change each character of a Perl scalar to/from a series of
\& # characters that represent the UTF\-8 bytes of each original character.
\&
\& utf8::encode($string);  # "\ex{100}"  becomes "\exc4\ex80"
\& utf8::decode($string);  # "\exc4\ex80" becomes "\ex{100}"
\&
\& # Convert a code point from the platform native character set to
\& # Unicode, and vice\-versa.
\& $unicode = utf8::native_to_unicode(ord(\*(AqA\*(Aq)); # returns 65 on both
\&                                               # ASCII and EBCDIC
\&                                               # platforms
\& $native = utf8::unicode_to_native(65);        # returns 65 on ASCII
\&                                               # platforms; 193 on
\&                                               # EBCDIC
\&
\& $flag = utf8::is_utf8($string); # since Perl 5.8.1
\& $flag = utf8::valid($string);
.Ve
.SH "DESCRIPTION"
.IX Header "DESCRIPTION"
The \f(CW\*(C`use utf8\*(C'\fR pragma tells the Perl parser to allow \s-1UTF\-8\s0 in the
program text in the current lexical scope.  The \f(CW\*(C`no utf8\*(C'\fR pragma tells Perl
to switch back to treating the source text as literal bytes in the current
lexical scope.  (On \s-1EBCDIC\s0 platforms, technically it is allowing UTF-EBCDIC,
and not \s-1UTF\-8,\s0 but this distinction is academic, so in this document the term
\&\s-1UTF\-8\s0 is used to mean both).
.PP
\&\fBDo not use this pragma for anything other than telling Perl that your
script is written in \s-1UTF\-8.\s0\fR The utility functions described below are
directly usable without \f(CW\*(C`use utf8;\*(C'\fR.
.PP
Because it is not possible to reliably tell \s-1UTF\-8\s0 from native 8 bit
encodings, you need either a Byte Order Mark at the beginning of your
source code, or \f(CW\*(C`use utf8;\*(C'\fR, to instruct perl.
.PP
When \s-1UTF\-8\s0 becomes the standard source format, this pragma will
effectively become a no-op.
.PP
See also the effects of the \f(CW\*(C`\-C\*(C'\fR switch and its cousin, the
\&\f(CW\*(C`PERL_UNICODE\*(C'\fR environment variable, in perlrun.
.PP
Enabling the \f(CW\*(C`utf8\*(C'\fR pragma has the following effect:
.IP "\(bu" 4
Bytes in the source text that are not in the \s-1ASCII\s0 character set will be
treated as being part of a literal \s-1UTF\-8\s0 sequence.  This includes most
literals such as identifier names, string constants, and constant
regular expression patterns.
.PP
Note that if you have non-ASCII, non\-UTF\-8 bytes in your script (for example
embedded Latin\-1 in your string literals), \f(CW\*(C`use utf8\*(C'\fR will be unhappy.  If
you want to have such bytes under \f(CW\*(C`use utf8\*(C'\fR, you can disable this pragma
until the end of the block (or file, if at top level) by \f(CW\*(C`no utf8;\*(C'\fR.
.SS "Utility functions"
.IX Subsection "Utility functions"
The following functions are defined in the \f(CW\*(C`utf8::\*(C'\fR package by the
Perl core.  You do not need to say \f(CW\*(C`use utf8\*(C'\fR to use these and in fact
you should not say that unless you really want to have \s-1UTF\-8\s0 source code.
.IP "\(bu" 4
\&\f(CW\*(C`$num_octets = utf8::upgrade($string)\*(C'\fR
.Sp
(Since Perl v5.8.0)
Converts in-place the internal representation of the string from an octet
sequence in the native encoding (Latin\-1 or \s-1EBCDIC\s0) to \s-1UTF\-8.\s0 The
logical character sequence itself is unchanged.  If \fI\f(CI$string\fI\fR is already
upgraded, then this is a no-op. Returns the
number of octets necessary to represent the string as \s-1UTF\-8.\s0
.Sp
If your code needs to be compatible with versions of perl without
\&\f(CW\*(C`use feature \*(Aqunicode_strings\*(Aq;\*(C'\fR, you can force Unicode semantics on
a given string:
.Sp
.Vb 3
\&  # force unicode semantics for $string without the
\&  # "unicode_strings" feature
\&  utf8::upgrade($string);
.Ve
.Sp
For example:
.Sp
.Vb 7
\&  # without explicit or implicit use feature \*(Aqunicode_strings\*(Aq
\&  my $x = "\exDF";    # LATIN SMALL LETTER SHARP S
\&  $x =~ /ss/i;       # won\*(Aqt match
\&  my $y = uc($x);    # won\*(Aqt convert
\&  utf8::upgrade($x);
\&  $x =~ /ss/i;       # matches
\&  my $z = uc($x);    # converts to "SS"
.Ve
.Sp
\&\fBNote that this function does not handle arbitrary encodings\fR;
use Encode instead.
.IP "\(bu" 4
\&\f(CW\*(C`$success = utf8::downgrade($string[, $fail_ok])\*(C'\fR
.Sp
(Since Perl v5.8.0)
Converts in-place the internal representation of the string from \s-1UTF\-8\s0 to the
equivalent octet sequence in the native encoding (Latin\-1 or \s-1EBCDIC\s0). The
logical character sequence itself is unchanged. If \fI\f(CI$string\fI\fR is already
stored as native 8 bit, then this is a no-op.  Can be used to make sure that
the \s-1UTF\-8\s0 flag is off, e.g. when you want to make sure that the \fBsubstr()\fR or
\&\fBlength()\fR function works with the usually faster byte algorithm.
.Sp
Fails if the original \s-1UTF\-8\s0 sequence cannot be represented in the
native 8 bit encoding. On failure dies or, if the value of \fI\f(CI$fail_ok\fI\fR is
true, returns false.
.Sp
Returns true on success.
.Sp
If your code expects an octet sequence this can be used to validate
that you've received one:
.Sp
.Vb 2
\&  # throw an exception if not representable as octets
\&  utf8::downgrade($string)
\&
\&  # or do your own error handling
\&  utf8::downgrade($string, 1) or die "string must be octets";
.Ve
.Sp
\&\fBNote that this function does not handle arbitrary encodings\fR;
use Encode instead.
.IP "\(bu" 4
\&\f(CW\*(C`utf8::encode($string)\*(C'\fR
.Sp
(Since Perl v5.8.0)
Converts in-place the character sequence to the corresponding octet
sequence in Perl's extended \s-1UTF\-8.\s0 That is, every (possibly wide) character
gets replaced with a sequence of one or more characters that represent the
individual \s-1UTF\-8\s0 bytes of the character.  The \s-1UTF8\s0 flag is turned off.
Returns nothing.
.Sp
.Vb 4
\& my $x = "\ex{100}"; # $x contains one character, with ord 0x100
\& utf8::encode($x);  # $x contains two characters, with ords (on
\&                    # ASCII platforms) 0xc4 and 0x80.  On EBCDIC
\&                    # 1047, this would instead be 0x8C and 0x41.
.Ve
.Sp
Similar to:
.Sp
.Vb 2
\&  use Encode;
\&  $x = Encode::encode("utf8", $x);
.Ve
.Sp
\&\fBNote that this function does not handle arbitrary encodings\fR;
use Encode instead.
.IP "\(bu" 4
\&\f(CW\*(C`$success = utf8::decode($string)\*(C'\fR
.Sp
(Since Perl v5.8.0)
Attempts to convert in-place the octet sequence encoded in Perl's extended
\&\s-1UTF\-8\s0 to the corresponding character sequence. That is, it replaces each
sequence of characters in the string whose ords represent a valid (extended)
\&\s-1UTF\-8\s0 byte sequence, with the corresponding single character.  The \s-1UTF\-8\s0 flag
is turned on only if the source string contains multiple-byte \s-1UTF\-8\s0
characters.  If \fI\f(CI$string\fI\fR is invalid as extended \s-1UTF\-8,\s0 returns false;
otherwise returns true.
.Sp
.Vb 6
\& my $x = "\exc4\ex80"; # $x contains two characters, with ords
\&                     # 0xc4 and 0x80
\& utf8::decode($x);   # On ASCII platforms, $x contains one char,
\&                     # with ord 0x100.   Since these bytes aren\*(Aqt
\&                     # legal UTF\-EBCDIC, on EBCDIC platforms, $x is
\&                     # unchanged and the function returns FALSE.
.Ve
.Sp
\&\fBNote that this function does not handle arbitrary encodings\fR;
use Encode instead.
.IP "\(bu" 4
\&\f(CW\*(C`$unicode = utf8::native_to_unicode($code_point)\*(C'\fR
.Sp
(Since Perl v5.8.0)
This takes an unsigned integer (which represents the ordinal number of a
character (or a code point) on the platform the program is being run on) and
returns its Unicode equivalent value.  Since \s-1ASCII\s0 platforms natively use the
Unicode code points, this function returns its input on them.  On \s-1EBCDIC\s0
platforms it converts from \s-1EBCDIC\s0 to Unicode.
.Sp
A meaningless value will currently be returned if the input is not an unsigned
integer.
.Sp
Since Perl v5.22.0, calls to this function are optimized out on \s-1ASCII\s0
platforms, so there is no performance hit in using it there.
.IP "\(bu" 4
\&\f(CW\*(C`$native = utf8::unicode_to_native($code_point)\*(C'\fR
.Sp
(Since Perl v5.8.0)
This is the inverse of \f(CW\*(C`utf8::native_to_unicode()\*(C'\fR, converting the other
direction.  Again, on \s-1ASCII\s0 platforms, this returns its input, but on \s-1EBCDIC\s0
platforms it will find the native platform code point, given any Unicode one.
.Sp
A meaningless value will currently be returned if the input is not an unsigned
integer.
.Sp
Since Perl v5.22.0, calls to this function are optimized out on \s-1ASCII\s0
platforms, so there is no performance hit in using it there.
.IP "\(bu" 4
\&\f(CW\*(C`$flag = utf8::is_utf8($string)\*(C'\fR
.Sp
(Since Perl 5.8.1)  Test whether \fI\f(CI$string\fI\fR is marked internally as encoded in
\&\s-1UTF\-8.\s0  Functionally the same as \f(CW\*(C`Encode::is_utf8($string)\*(C'\fR.
.Sp
Typically only necessary for debugging and testing.  If you need to
dump the internals of an \s-1SV,\s0 Devel::Peek's \fBDump()\fR
provides more detail in a compact form.
.Sp
If you still think you need this outside of debugging, testing or
dealing with filenames, you should probably read perlunitut and
\&\*(L"What is \*(L"the \s-1UTF8\s0 flag\*(R"?\*(R" in perlunifaq.
.Sp
Don't use this flag as a marker to distinguish character and binary
data: that should be decided for each variable when you write your
code.
.Sp
To force unicode semantics in code portable to perl 5.8 and 5.10, call
\&\f(CW\*(C`utf8::upgrade($string)\*(C'\fR unconditionally.
.IP "\(bu" 4
\&\f(CW\*(C`$flag = utf8::valid($string)\*(C'\fR
.Sp
[\s-1INTERNAL\s0] Test whether \fI\f(CI$string\fI\fR is in a consistent state regarding
\&\s-1UTF\-8.\s0  Will return true if it is well-formed Perl extended \s-1UTF\-8\s0 and has the
\&\s-1UTF\-8\s0 flag
on \fBor\fR if \fI\f(CI$string\fI\fR is held as bytes (both these states are 'consistent').
The main reason for this routine is to allow Perl's test suite to check
that operations have left strings in a consistent state.
.PP
\&\f(CW\*(C`utf8::encode\*(C'\fR is like \f(CW\*(C`utf8::upgrade\*(C'\fR, but the \s-1UTF8\s0 flag is
cleared.  See perlunicode, and the C \s-1API\s0
functions \f(CW\*(C`sv_utf8_upgrade\*(C'\fR,
\&\f(CW\*(C`"sv_utf8_downgrade" in perlapi\*(C'\fR, \f(CW\*(C`"sv_utf8_encode" in perlapi\*(C'\fR,
and \f(CW\*(C`"sv_utf8_decode" in perlapi\*(C'\fR, which are wrapped by the Perl functions
\&\f(CW\*(C`utf8::upgrade\*(C'\fR, \f(CW\*(C`utf8::downgrade\*(C'\fR, \f(CW\*(C`utf8::encode\*(C'\fR and
\&\f(CW\*(C`utf8::decode\*(C'\fR.  Also, the functions \f(CW\*(C`utf8::is_utf8\*(C'\fR, \f(CW\*(C`utf8::valid\*(C'\fR,
\&\f(CW\*(C`utf8::encode\*(C'\fR, \f(CW\*(C`utf8::decode\*(C'\fR, \f(CW\*(C`utf8::upgrade\*(C'\fR, and \f(CW\*(C`utf8::downgrade\*(C'\fR are
actually internal, and thus always available, without a \f(CW\*(C`require utf8\*(C'\fR
statement.
.SH "BUGS"
.IX Header "BUGS"
Some filesystems may not support \s-1UTF\-8\s0 file names, or they may be supported
incompatibly with Perl.  Therefore \s-1UTF\-8\s0 names that are visible to the
filesystem, such as module names, may not work.
.SH "SEE ALSO"
.IX Header "SEE ALSO"
perlunitut, perluniintro, perlrun, bytes, perlunicode