You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
hello-algo/en/chapter_data_structure/character_encoding/index.html

1936 lines
66 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

<!doctype html>
<html lang="en" class="no-js">
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width,initial-scale=1">
<meta name="description" content="Data Structures and Algorithms Crash Course with Animated Illustrations and Off-the-Shelf Code">
<meta name="author" content="krahets">
<link rel="canonical" href="https://www.hello-algo.com/en/chapter_data_structure/character_encoding/">
<link rel="prev" href="../number_encoding/">
<link rel="next" href="../summary/">
<link rel="icon" href="../../assets/images/favicon.png">
<meta name="generator" content="mkdocs-1.5.3, mkdocs-material-9.5.5">
<title>3.4 Character Encoding * - Hello Algo</title>
<link rel="stylesheet" href="../../assets/stylesheets/main.50c56a3b.min.css">
<link rel="stylesheet" href="../../assets/stylesheets/palette.06af60db.min.css">
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
<link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
<style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
<link rel="stylesheet" href="../../stylesheets/extra.css">
<script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
<link href="../../assets/stylesheets/glightbox.min.css" rel="stylesheet"/><style>
html.glightbox-open { overflow: initial; height: 100%; }
.gslide-title { margin-top: 0px; user-select: text; }
.gslide-desc { color: #666; user-select: text; }
.gslide-image img { background: white; }
.gscrollbar-fixer { padding-right: 15px; }
.gdesc-inner { font-size: 0.75rem; }
body[data-md-color-scheme="slate"] .gdesc-inner { background: var(--md-default-bg-color);}
body[data-md-color-scheme="slate"] .gslide-title { color: var(--md-default-fg-color);}
body[data-md-color-scheme="slate"] .gslide-desc { color: var(--md-default-fg-color);}
</style> <script src="../../assets/javascripts/glightbox.min.js"></script></head>
<body dir="ltr" data-md-color-scheme="default" data-md-color-primary="white" data-md-color-accent="teal">
<input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
<input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
<label class="md-overlay" for="__drawer"></label>
<div data-md-component="skip">
<a href="#34-character-encoding" class="md-skip">
Skip to content
</a>
</div>
<div data-md-component="announce">
<aside class="md-banner">
<div class="md-banner__inner md-grid md-typeset">
<button class="md-banner__button md-icon" aria-label="Don't show this again">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
</button>
<div class="banner-svg">
<svg xmlns="http://www.w3.org/2000/svg"
viewBox="0 0 512 512"><!--!Font Awesome Free 6.5.1 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free Copyright 2024 Fonticons, Inc.-->
<path
d="M480 32c0-12.9-7.8-24.6-19.8-29.6s-25.7-2.2-34.9 6.9L381.7 53c-48 48-113.1 75-181 75H192 160 64c-35.3 0-64 28.7-64 64v96c0 35.3 28.7 64 64 64l0 128c0 17.7 14.3 32 32 32h64c17.7 0 32-14.3 32-32V352l8.7 0c67.9 0 133 27 181 75l43.6 43.6c9.2 9.2 22.9 11.9 34.9 6.9s19.8-16.6 19.8-29.6V300.4c18.6-8.8 32-32.5 32-60.4s-13.4-51.6-32-60.4V32zm-64 76.7V240 371.3C357.2 317.8 280.5 288 200.7 288H192V192h8.7c79.8 0 156.5-29.8 215.3-83.3z" />
</svg>
<span>The paper book (Chinese edition) will be published soon. Stay tuned!</span>
</div>
</div>
<script>var content,el=document.querySelector("[data-md-component=announce]");el&&(content=el.querySelector(".md-typeset"),__md_hash(content.innerHTML)===__md_get("__announce")&&(el.hidden=!0))</script>
</aside>
</div>
<header class="md-header md-header--shadow" data-md-component="header">
<nav class="md-header__inner md-grid" aria-label="Header">
<a href="../.." title="Hello Algo" class="md-header__button md-logo" aria-label="Hello Algo" data-md-component="logo">
<img src="../../assets/images/logo.svg" alt="logo">
</a>
<label class="md-header__button md-icon" for="__drawer">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
</label>
<div class="md-header__title" data-md-component="header-title">
<div class="md-header__ellipsis">
<div class="md-header__topic">
<span class="md-ellipsis">
Hello Algo
</span>
</div>
<div class="md-header__topic" data-md-component="header-topic">
<span class="md-ellipsis">
3.4 Character Encoding *
</span>
</div>
</div>
</div>
<form class="md-header__option" data-md-component="palette">
<input class="md-option" data-md-color-media="" data-md-color-scheme="default" data-md-color-primary="white" data-md-color-accent="teal" aria-label="Dark mode" type="radio" name="__palette" id="__palette_0">
<label class="md-header__button md-icon" title="Dark mode" for="__palette_1" hidden>
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M7.5 2c-1.79 1.15-3 3.18-3 5.5s1.21 4.35 3.03 5.5C4.46 13 2 10.54 2 7.5A5.5 5.5 0 0 1 7.5 2m11.57 1.5 1.43 1.43L4.93 20.5 3.5 19.07 19.07 3.5m-6.18 2.43L11.41 5 9.97 6l.42-1.7L9 3.24l1.75-.12.58-1.65L12 3.1l1.73.03-1.35 1.13.51 1.67m-3.3 3.61-1.16-.73-1.12.78.34-1.32-1.09-.83 1.36-.09.45-1.29.51 1.27 1.36.03-1.05.87.4 1.31M19 13.5a5.5 5.5 0 0 1-5.5 5.5c-1.22 0-2.35-.4-3.26-1.07l7.69-7.69c.67.91 1.07 2.04 1.07 3.26m-4.4 6.58 2.77-1.15-.24 3.35-2.53-2.2m4.33-2.7 1.15-2.77 2.2 2.54-3.35.23m1.15-4.96-1.14-2.78 3.34.24-2.2 2.54M9.63 18.93l2.77 1.15-2.53 2.19-.24-3.34Z"/></svg>
</label>
<input class="md-option" data-md-color-media="" data-md-color-scheme="slate" data-md-color-primary="black" data-md-color-accent="teal" aria-label="Light mode" type="radio" name="__palette" id="__palette_1">
<label class="md-header__button md-icon" title="Light mode" for="__palette_0" hidden>
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M7.5 2c-1.79 1.15-3 3.18-3 5.5s1.21 4.35 3.03 5.5C4.46 13 2 10.54 2 7.5A5.5 5.5 0 0 1 7.5 2m11.57 1.5 1.43 1.43L4.93 20.5 3.5 19.07 19.07 3.5m-6.18 2.43L11.41 5 9.97 6l.42-1.7L9 3.24l1.75-.12.58-1.65L12 3.1l1.73.03-1.35 1.13.51 1.67m-3.3 3.61-1.16-.73-1.12.78.34-1.32-1.09-.83 1.36-.09.45-1.29.51 1.27 1.36.03-1.05.87.4 1.31M19 13.5a5.5 5.5 0 0 1-5.5 5.5c-1.22 0-2.35-.4-3.26-1.07l7.69-7.69c.67.91 1.07 2.04 1.07 3.26m-4.4 6.58 2.77-1.15-.24 3.35-2.53-2.2m4.33-2.7 1.15-2.77 2.2 2.54-3.35.23m1.15-4.96-1.14-2.78 3.34.24-2.2 2.54M9.63 18.93l2.77 1.15-2.53 2.19-.24-3.34Z"/></svg>
</label>
</form>
<script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
<div class="md-header__option">
<div class="md-select">
<button class="md-header__button md-icon" aria-label="Select language">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="m12.87 15.07-2.54-2.51.03-.03A17.52 17.52 0 0 0 14.07 6H17V4h-7V2H8v2H1v2h11.17C11.5 7.92 10.44 9.75 9 11.35 8.07 10.32 7.3 9.19 6.69 8h-2c.73 1.63 1.73 3.17 2.98 4.56l-5.09 5.02L4 19l5-5 3.11 3.11.76-2.04M18.5 10h-2L12 22h2l1.12-3h4.75L21 22h2l-4.5-12m-2.62 7 1.62-4.33L19.12 17h-3.24Z"/></svg>
</button>
<div class="md-select__inner">
<ul class="md-select__list">
<li class="md-select__item">
<a href="/" hreflang="zh" class="md-select__link">
中文
</a>
</li>
<li class="md-select__item">
<a href="/en/" hreflang="en" class="md-select__link">
English
</a>
</li>
</ul>
</div>
</div>
</div>
<label class="md-header__button md-icon" for="__search">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
</label>
<div class="md-search" data-md-component="search" role="dialog">
<label class="md-search__overlay" for="__search"></label>
<div class="md-search__inner" role="search">
<form class="md-search__form" name="search">
<input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
<label class="md-search__icon md-icon" for="__search">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
</label>
<nav class="md-search__options" aria-label="Search">
<a href="javascript:void(0)" class="md-search__icon md-icon" title="Share" aria-label="Share" data-clipboard data-clipboard-text="" data-md-component="search-share" tabindex="-1">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M18 16.08c-.76 0-1.44.3-1.96.77L8.91 12.7c.05-.23.09-.46.09-.7 0-.24-.04-.47-.09-.7l7.05-4.11c.54.5 1.25.81 2.04.81a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3c0 .24.04.47.09.7L8.04 9.81C7.5 9.31 6.79 9 6 9a3 3 0 0 0-3 3 3 3 0 0 0 3 3c.79 0 1.5-.31 2.04-.81l7.12 4.15c-.05.21-.08.43-.08.66 0 1.61 1.31 2.91 2.92 2.91 1.61 0 2.92-1.3 2.92-2.91A2.92 2.92 0 0 0 18 16.08Z"/></svg>
</a>
<button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
</button>
</nav>
<div class="md-search__suggest" data-md-component="search-suggest"></div>
</form>
<div class="md-search__output">
<div class="md-search__scrollwrap" data-md-scrollfix>
<div class="md-search-result" data-md-component="search-result">
<div class="md-search-result__meta">
Initializing search
</div>
<ol class="md-search-result__list" role="presentation"></ol>
</div>
</div>
</div>
</div>
</div>
<div class="md-header__source">
<a href="https://github.com/krahets/hello-algo" title="Go to repository" class="md-source" data-md-component="source">
<div class="md-source__icon md-icon">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 496 512"><!--! Font Awesome Free 6.5.1 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2023 Fonticons, Inc.--><path d="M165.9 397.4c0 2-2.3 3.6-5.2 3.6-3.3.3-5.6-1.3-5.6-3.6 0-2 2.3-3.6 5.2-3.6 3-.3 5.6 1.3 5.6 3.6zm-31.1-4.5c-.7 2 1.3 4.3 4.3 4.9 2.6 1 5.6 0 6.2-2s-1.3-4.3-4.3-5.2c-2.6-.7-5.5.3-6.2 2.3zm44.2-1.7c-2.9.7-4.9 2.6-4.6 4.9.3 2 2.9 3.3 5.9 2.6 2.9-.7 4.9-2.6 4.6-4.6-.3-1.9-3-3.2-5.9-2.9zM244.8 8C106.1 8 0 113.3 0 252c0 110.9 69.8 205.8 169.5 239.2 12.8 2.3 17.3-5.6 17.3-12.1 0-6.2-.3-40.4-.3-61.4 0 0-70 15-84.7-29.8 0 0-11.4-29.1-27.8-36.6 0 0-22.9-15.7 1.6-15.4 0 0 24.9 2 38.6 25.8 21.9 38.6 58.6 27.5 72.9 20.9 2.3-16 8.8-27.1 16-33.7-55.9-6.2-112.3-14.3-112.3-110.5 0-27.5 7.6-41.3 23.6-58.9-2.6-6.5-11.1-33.3 2.6-67.9 20.9-6.5 69 27 69 27 20-5.6 41.5-8.5 62.8-8.5s42.8 2.9 62.8 8.5c0 0 48.1-33.6 69-27 13.7 34.7 5.2 61.4 2.6 67.9 16 17.7 25.8 31.5 25.8 58.9 0 96.5-58.9 104.2-114.8 110.5 9.2 7.9 17 22.9 17 46.4 0 33.7-.3 75.4-.3 83.6 0 6.5 4.6 14.4 17.3 12.1C428.2 457.8 496 362.9 496 252 496 113.3 383.5 8 244.8 8zM97.2 352.9c-1.3 1-1 3.3.7 5.2 1.6 1.6 3.9 2.3 5.2 1 1.3-1 1-3.3-.7-5.2-1.6-1.6-3.9-2.3-5.2-1zm-10.8-8.1c-.7 1.3.3 2.9 2.3 3.9 1.6 1 3.6.7 4.3-.7.7-1.3-.3-2.9-2.3-3.9-2-.6-3.6-.3-4.3.7zm32.4 35.6c-1.6 1.3-1 4.3 1.3 6.2 2.3 2.3 5.2 2.6 6.5 1 1.3-1.3.7-4.3-1.3-6.2-2.2-2.3-5.2-2.6-6.5-1zm-11.4-14.7c-1.6 1-1.6 3.6 0 5.9 1.6 2.3 4.3 3.3 5.6 2.3 1.6-1.3 1.6-3.9 0-6.2-1.4-2.3-4-3.3-5.6-2z"/></svg>
</div>
<div class="md-source__repository">
krahets/hello-algo
</div>
</a>
</div>
</nav>
</header>
<div class="md-container" data-md-component="container">
<main class="md-main" data-md-component="main">
<div class="md-main__inner md-grid">
<div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
<div class="md-sidebar__scrollwrap">
<div class="md-sidebar__inner">
<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
<label class="md-nav__title" for="__drawer">
<a href="../.." title="Hello Algo" class="md-nav__button md-logo" aria-label="Hello Algo" data-md-component="logo">
<img src="../../assets/images/logo.svg" alt="logo">
</a>
Hello Algo
</label>
<div class="md-nav__source">
<a href="https://github.com/krahets/hello-algo" title="Go to repository" class="md-source" data-md-component="source">
<div class="md-source__icon md-icon">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 496 512"><!--! Font Awesome Free 6.5.1 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2023 Fonticons, Inc.--><path d="M165.9 397.4c0 2-2.3 3.6-5.2 3.6-3.3.3-5.6-1.3-5.6-3.6 0-2 2.3-3.6 5.2-3.6 3-.3 5.6 1.3 5.6 3.6zm-31.1-4.5c-.7 2 1.3 4.3 4.3 4.9 2.6 1 5.6 0 6.2-2s-1.3-4.3-4.3-5.2c-2.6-.7-5.5.3-6.2 2.3zm44.2-1.7c-2.9.7-4.9 2.6-4.6 4.9.3 2 2.9 3.3 5.9 2.6 2.9-.7 4.9-2.6 4.6-4.6-.3-1.9-3-3.2-5.9-2.9zM244.8 8C106.1 8 0 113.3 0 252c0 110.9 69.8 205.8 169.5 239.2 12.8 2.3 17.3-5.6 17.3-12.1 0-6.2-.3-40.4-.3-61.4 0 0-70 15-84.7-29.8 0 0-11.4-29.1-27.8-36.6 0 0-22.9-15.7 1.6-15.4 0 0 24.9 2 38.6 25.8 21.9 38.6 58.6 27.5 72.9 20.9 2.3-16 8.8-27.1 16-33.7-55.9-6.2-112.3-14.3-112.3-110.5 0-27.5 7.6-41.3 23.6-58.9-2.6-6.5-11.1-33.3 2.6-67.9 20.9-6.5 69 27 69 27 20-5.6 41.5-8.5 62.8-8.5s42.8 2.9 62.8 8.5c0 0 48.1-33.6 69-27 13.7 34.7 5.2 61.4 2.6 67.9 16 17.7 25.8 31.5 25.8 58.9 0 96.5-58.9 104.2-114.8 110.5 9.2 7.9 17 22.9 17 46.4 0 33.7-.3 75.4-.3 83.6 0 6.5 4.6 14.4 17.3 12.1C428.2 457.8 496 362.9 496 252 496 113.3 383.5 8 244.8 8zM97.2 352.9c-1.3 1-1 3.3.7 5.2 1.6 1.6 3.9 2.3 5.2 1 1.3-1 1-3.3-.7-5.2-1.6-1.6-3.9-2.3-5.2-1zm-10.8-8.1c-.7 1.3.3 2.9 2.3 3.9 1.6 1 3.6.7 4.3-.7.7-1.3-.3-2.9-2.3-3.9-2-.6-3.6-.3-4.3.7zm32.4 35.6c-1.6 1.3-1 4.3 1.3 6.2 2.3 2.3 5.2 2.6 6.5 1 1.3-1.3.7-4.3-1.3-6.2-2.2-2.3-5.2-2.6-6.5-1zm-11.4-14.7c-1.6 1-1.6 3.6 0 5.9 1.6 2.3 4.3 3.3 5.6 2.3 1.6-1.3 1.6-3.9 0-6.2-1.4-2.3-4-3.3-5.6-2z"/></svg>
</div>
<div class="md-source__repository">
krahets/hello-algo
</div>
</a>
</div>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item md-nav__item--nested">
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_1" >
<div class="md-nav__link md-nav__container">
<a href="../../chapter_preface/" class="md-nav__link ">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M21 4H3a2 2 0 0 0-2 2v13a2 2 0 0 0 2 2h18a2 2 0 0 0 2-2V6a2 2 0 0 0-2-2M3 19V6h8v13H3m18 0h-8V6h8v13m-7-9.5h6V11h-6V9.5m0 2.5h6v1.5h-6V12m0 2.5h6V16h-6v-1.5Z"/></svg>
<span class="md-ellipsis">
Chapter 0. Preface
</span>
</a>
<label class="md-nav__link " for="__nav_1" id="__nav_1_label" tabindex="0">
<span class="md-nav__icon md-icon"></span>
</label>
</div>
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_1_label" aria-expanded="false">
<label class="md-nav__title" for="__nav_1">
<span class="md-nav__icon md-icon"></span>
Chapter 0. Preface
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="../../chapter_preface/about_the_book/" class="md-nav__link">
<span class="md-ellipsis">
0.1 About This Book
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter_preface/suggestions/" class="md-nav__link">
<span class="md-ellipsis">
0.2 How to Read
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter_preface/summary/" class="md-nav__link">
<span class="md-ellipsis">
0.3 Summary
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item md-nav__item--nested">
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_2" >
<div class="md-nav__link md-nav__container">
<a href="../../chapter_introduction/" class="md-nav__link ">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 3H5c-1.1 0-2 .9-2 2v14c0 1.1.9 2 2 2h14c1.1 0 2-.9 2-2V5c0-1.1-.9-2-2-2m0 16H5V5h14v14M6.2 7.7h5v1.5h-5V7.7m6.8 8.1h5v1.5h-5v-1.5m0-2.6h5v1.5h-5v-1.5M8 18h1.5v-2h2v-1.5h-2v-2H8v2H6V16h2v2m6.1-7.1 1.4-1.4 1.4 1.4 1.1-1-1.4-1.4L18 7.1 16.9 6l-1.4 1.4L14.1 6 13 7.1l1.4 1.4L13 9.9l1.1 1Z"/></svg>
<span class="md-ellipsis">
Chapter 1. Introduction to Algorithms
</span>
</a>
<label class="md-nav__link " for="__nav_2" id="__nav_2_label" tabindex="0">
<span class="md-nav__icon md-icon"></span>
</label>
</div>
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_2_label" aria-expanded="false">
<label class="md-nav__title" for="__nav_2">
<span class="md-nav__icon md-icon"></span>
Chapter 1. Introduction to Algorithms
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="../../chapter_introduction/algorithms_are_everywhere/" class="md-nav__link">
<span class="md-ellipsis">
1.1 Algorithms are Everywhere
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter_introduction/what_is_dsa/" class="md-nav__link">
<span class="md-ellipsis">
1.2 What is an Algorithm
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter_introduction/summary/" class="md-nav__link">
<span class="md-ellipsis">
1.3 Summary
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item md-nav__item--nested">
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_3" >
<div class="md-nav__link md-nav__container">
<a href="../../chapter_computational_complexity/" class="md-nav__link ">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M6 2h12v6l-4 4 4 4v6H6v-6l4-4-4-4V2m10 14.5-4-4-4 4V20h8v-3.5m-4-5 4-4V4H8v3.5l4 4M10 6h4v.75l-2 2-2-2V6Z"/></svg>
<span class="md-ellipsis">
Chapter 2. Complexity Analysis
</span>
</a>
<label class="md-nav__link " for="__nav_3" id="__nav_3_label" tabindex="0">
<span class="md-nav__icon md-icon"></span>
</label>
</div>
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_3_label" aria-expanded="false">
<label class="md-nav__title" for="__nav_3">
<span class="md-nav__icon md-icon"></span>
Chapter 2. Complexity Analysis
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="../../chapter_computational_complexity/performance_evaluation/" class="md-nav__link">
<span class="md-ellipsis">
2.1 Algorithm Efficiency Assessment
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter_computational_complexity/iteration_and_recursion/" class="md-nav__link">
<span class="md-ellipsis">
2.2 Iteration and Recursion
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter_computational_complexity/time_complexity/" class="md-nav__link">
<span class="md-ellipsis">
2.3 Time Complexity
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter_computational_complexity/space_complexity/" class="md-nav__link">
<span class="md-ellipsis">
2.4 Space Complexity
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter_computational_complexity/summary/" class="md-nav__link">
<span class="md-ellipsis">
2.5 Summary
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item md-nav__item--active md-nav__item--nested">
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_4" checked>
<div class="md-nav__link md-nav__container">
<a href="../" class="md-nav__link ">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M11 13.5v8H3v-8h8m-2 2H5v4h4v-4M12 2l5.5 9h-11L12 2m0 3.86L10.08 9h3.84L12 5.86M17.5 13c2.5 0 4.5 2 4.5 4.5S20 22 17.5 22 13 20 13 17.5s2-4.5 4.5-4.5m0 2a2.5 2.5 0 0 0-2.5 2.5 2.5 2.5 0 0 0 2.5 2.5 2.5 2.5 0 0 0 2.5-2.5 2.5 2.5 0 0 0-2.5-2.5Z"/></svg>
<span class="md-ellipsis">
Chapter 3. Data Structures
</span>
</a>
<label class="md-nav__link " for="__nav_4" id="__nav_4_label" tabindex="0">
<span class="md-nav__icon md-icon"></span>
</label>
</div>
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_4_label" aria-expanded="true">
<label class="md-nav__title" for="__nav_4">
<span class="md-nav__icon md-icon"></span>
Chapter 3. Data Structures
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="../classification_of_data_structure/" class="md-nav__link">
<span class="md-ellipsis">
3.1 Classification of Data Structures
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../basic_data_types/" class="md-nav__link">
<span class="md-ellipsis">
3.2 Fundamental Data Types
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../number_encoding/" class="md-nav__link">
<span class="md-ellipsis">
3.3 Number Encoding *
</span>
</a>
</li>
<li class="md-nav__item md-nav__item--active">
<input class="md-nav__toggle md-toggle" type="checkbox" id="__toc">
<label class="md-nav__link md-nav__link--active" for="__toc">
<span class="md-ellipsis">
3.4 Character Encoding *
</span>
<span class="md-nav__icon md-icon"></span>
</label>
<a href="./" class="md-nav__link md-nav__link--active">
<span class="md-ellipsis">
3.4 Character Encoding *
</span>
</a>
<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
<label class="md-nav__title" for="__toc">
<span class="md-nav__icon md-icon"></span>
Table of contents
</label>
<ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
<li class="md-nav__item">
<a href="#341-ascii-character-set" class="md-nav__link">
<span class="md-ellipsis">
3.4.1 &nbsp; ASCII Character Set
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#342-gbk-character-set" class="md-nav__link">
<span class="md-ellipsis">
3.4.2 &nbsp; GBK Character Set
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#343-unicode-character-set" class="md-nav__link">
<span class="md-ellipsis">
3.4.3 &nbsp; Unicode Character Set
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#344-utf-8-encoding" class="md-nav__link">
<span class="md-ellipsis">
3.4.4 &nbsp; UTF-8 Encoding
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#345-character-encoding-in-programming-languages" class="md-nav__link">
<span class="md-ellipsis">
3.4.5 &nbsp; Character Encoding in Programming Languages
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item">
<a href="../summary/" class="md-nav__link">
<span class="md-ellipsis">
3.5 Summary
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item md-nav__item--nested">
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_5" >
<div class="md-nav__link md-nav__container">
<a href="../../chapter_array_and_linkedlist/" class="md-nav__link ">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 5v14h17V5H3m4 2v2H5V7h2m-2 6v-2h2v2H5m0 2h2v2H5v-2m13 2H9v-2h9v2m0-4H9v-2h9v2m0-4H9V7h9v2Z"/></svg>
<span class="md-ellipsis">
Chapter 4. Array and Linked List
</span>
</a>
<label class="md-nav__link " for="__nav_5" id="__nav_5_label" tabindex="0">
<span class="md-nav__icon md-icon"></span>
</label>
</div>
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_5_label" aria-expanded="false">
<label class="md-nav__title" for="__nav_5">
<span class="md-nav__icon md-icon"></span>
Chapter 4. Array and Linked List
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="../../chapter_array_and_linkedlist/array/" class="md-nav__link">
<span class="md-ellipsis">
4.1 Array
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter_array_and_linkedlist/linked_list/" class="md-nav__link">
<span class="md-ellipsis">
4.2 Linked List
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter_array_and_linkedlist/list/" class="md-nav__link">
<span class="md-ellipsis">
4.3 List
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter_array_and_linkedlist/ram_and_cache/" class="md-nav__link">
<span class="md-ellipsis">
4.4 Memory and Cache
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter_array_and_linkedlist/summary/" class="md-nav__link">
<span class="md-ellipsis">
4.5 Summary
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item md-nav__item--nested">
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_6" >
<div class="md-nav__link md-nav__container">
<a href="../../chapter_stack_and_queue/" class="md-nav__link ">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M17.36 20.2v-5.38h1.79V22H3v-7.18h1.8v5.38h12.56M6.77 14.32l.37-1.76 8.79 1.85-.37 1.76-8.79-1.85m1.16-4.21.76-1.61 8.14 3.78-.76 1.62-8.14-3.79m2.26-3.99 1.15-1.38 6.9 5.76-1.15 1.37-6.9-5.75m4.45-4.25L20 9.08l-1.44 1.07-5.36-7.21 1.44-1.07M6.59 18.41v-1.8h8.98v1.8H6.59Z"/></svg>
<span class="md-ellipsis">
Chapter 5. Stack and Queue
</span>
</a>
<label class="md-nav__link " for="__nav_6" id="__nav_6_label" tabindex="0">
<span class="md-nav__icon md-icon"></span>
</label>
</div>
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_6_label" aria-expanded="false">
<label class="md-nav__title" for="__nav_6">
<span class="md-nav__icon md-icon"></span>
Chapter 5. Stack and Queue
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="../../chapter_stack_and_queue/stack/" class="md-nav__link">
<span class="md-ellipsis">
5.1 Stack
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter_stack_and_queue/queue/" class="md-nav__link">
<span class="md-ellipsis">
5.2 Queue
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter_stack_and_queue/deque/" class="md-nav__link">
<span class="md-ellipsis">
5.3 Double-ended Queue
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter_stack_and_queue/summary/" class="md-nav__link">
<span class="md-ellipsis">
5.4 Summary
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item md-nav__item--nested">
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_7" >
<div class="md-nav__link md-nav__container">
<a href="../../chapter_hashing/" class="md-nav__link ">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19.3 17.89c1.32-2.1.7-4.89-1.41-6.21a4.52 4.52 0 0 0-6.21 1.41C10.36 15.2 11 18 13.09 19.3c1.47.92 3.33.92 4.8 0L21 22.39 22.39 21l-3.09-3.11m-2-.62c-.98.98-2.56.97-3.54 0-.97-.98-.97-2.56.01-3.54.97-.97 2.55-.97 3.53 0 .96.99.95 2.57-.03 3.54h.03M19 4H5a2 2 0 0 0-2 2v12a2 2 0 0 0 2 2h5.81a6.3 6.3 0 0 1-1.31-2H5v-4h4.18c.16-.71.43-1.39.82-2H5V8h6v2.81a6.3 6.3 0 0 1 2-1.31V8h6v2a6.499 6.499 0 0 1 2 2V6a2 2 0 0 0-2-2Z"/></svg>
<span class="md-ellipsis">
Chapter 6. Hash Table
</span>
</a>
<label class="md-nav__link " for="__nav_7" id="__nav_7_label" tabindex="0">
<span class="md-nav__icon md-icon"></span>
</label>
</div>
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_7_label" aria-expanded="false">
<label class="md-nav__title" for="__nav_7">
<span class="md-nav__icon md-icon"></span>
Chapter 6. Hash Table
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="../../chapter_hashing/hash_map/" class="md-nav__link">
<span class="md-ellipsis">
6.1 Hash Table
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter_hashing/hash_collision/" class="md-nav__link">
<span class="md-ellipsis">
6.2 Hash Collision
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter_hashing/hash_algorithm/" class="md-nav__link">
<span class="md-ellipsis">
6.3 Hash Algorithm
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter_hashing/summary/" class="md-nav__link">
<span class="md-ellipsis">
6.4 Summary
</span>
</a>
</li>
</ul>
</nav>
</li>
</ul>
</nav>
</div>
</div>
</div>
<div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
<div class="md-sidebar__scrollwrap">
<div class="md-sidebar__inner">
<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
<label class="md-nav__title" for="__toc">
<span class="md-nav__icon md-icon"></span>
Table of contents
</label>
<ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
<li class="md-nav__item">
<a href="#341-ascii-character-set" class="md-nav__link">
<span class="md-ellipsis">
3.4.1 &nbsp; ASCII Character Set
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#342-gbk-character-set" class="md-nav__link">
<span class="md-ellipsis">
3.4.2 &nbsp; GBK Character Set
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#343-unicode-character-set" class="md-nav__link">
<span class="md-ellipsis">
3.4.3 &nbsp; Unicode Character Set
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#344-utf-8-encoding" class="md-nav__link">
<span class="md-ellipsis">
3.4.4 &nbsp; UTF-8 Encoding
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#345-character-encoding-in-programming-languages" class="md-nav__link">
<span class="md-ellipsis">
3.4.5 &nbsp; Character Encoding in Programming Languages
</span>
</a>
</li>
</ul>
</nav>
</div>
</div>
</div>
<div class="md-content" data-md-component="content">
<article class="md-content__inner md-typeset">
<!-- Tags -->
<!-- Actions -->
<!-- Actions -->
<!-- Edit button -->
<a
href="https://github.com/krahets/hello-algo/tree/main/docs-en/chapter_data_structure/character_encoding.md"
title="Edit this page"
class="md-content__button md-icon"
>
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512"><!--! Font Awesome Free 6.5.1 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2023 Fonticons, Inc.--><path d="M441 58.9 453.1 71c9.4 9.4 9.4 24.6 0 33.9L424 134.1 377.9 88 407 58.9c9.4-9.4 24.6-9.4 33.9 0zM209.8 256.2 344 121.9l46.1 46.1-134.3 134.2c-2.9 2.9-6.5 5-10.4 6.1L186.9 325l16.7-58.5c1.1-3.9 3.2-7.5 6.1-10.4zM373.1 25 175.8 222.2c-8.7 8.7-15 19.4-18.3 31.1l-28.6 100c-2.4 8.4-.1 17.4 6.1 23.6s15.2 8.5 23.6 6.1l100-28.6c11.8-3.4 22.5-9.7 31.1-18.3L487 138.9c28.1-28.1 28.1-73.7 0-101.8L474.9 25c-28.1-28.1-73.7-28.1-101.8 0zM88 64c-48.6 0-88 39.4-88 88v272c0 48.6 39.4 88 88 88h272c48.6 0 88-39.4 88-88V312c0-13.3-10.7-24-24-24s-24 10.7-24 24v112c0 22.1-17.9 40-40 40H88c-22.1 0-40-17.9-40-40V152c0-22.1 17.9-40 40-40h112c13.3 0 24-10.7 24-24s-10.7-24-24-24H88z"/></svg>
</a>
<!-- View button -->
<!-- Page content -->
<h1 id="34-character-encoding">3.4 &nbsp; Character Encoding *<a class="headerlink" href="#34-character-encoding" title="Permanent link">&para;</a></h1>
<p>In the computer system, all data is stored in binary form, and characters (represented by char) are no exception. To represent characters, we need to develop a "character set" that defines a one-to-one mapping between each character and binary numbers. With the character set, computers can convert binary numbers to characters by looking up the table.</p>
<h2 id="341-ascii-character-set">3.4.1 &nbsp; ASCII Character Set<a class="headerlink" href="#341-ascii-character-set" title="Permanent link">&para;</a></h2>
<p>The "ASCII code" is one of the earliest character sets, officially known as the American Standard Code for Information Interchange. It uses 7 binary digits (the lower 7 bits of a byte) to represent a character, allowing for a maximum of 128 different characters. As shown in the Figure 3-6 , ASCII includes uppercase and lowercase English letters, numbers 0 ~ 9, various punctuation marks, and certain control characters (such as newline and tab).</p>
<p><a class="glightbox" href="../character_encoding.assets/ascii_table.png" data-type="image" data-width="100%" data-height="auto" data-desc-position="bottom"><img alt="ASCII Code" class="animation-figure" src="../character_encoding.assets/ascii_table.png" /></a></p>
<p align="center"> Figure 3-6 &nbsp; ASCII Code </p>
<p>However, <strong>ASCII can only represent English characters</strong>. With the globalization of computers, a character set called "EASCII" was developed to represent more languages. It expands from the 7-bit structure of ASCII to 8 bits, enabling the representation of 256 characters.</p>
<p>Globally, various region-specific EASCII character sets have been introduced. The first 128 characters of these sets are consistent with the ASCII, while the remaining 128 characters are defined differently to accommodate the requirements of different languages.</p>
<h2 id="342-gbk-character-set">3.4.2 &nbsp; GBK Character Set<a class="headerlink" href="#342-gbk-character-set" title="Permanent link">&para;</a></h2>
<p>Later, it was found that <strong>EASCII still could not meet the character requirements of many languages</strong>. For instance, there are nearly a hundred thousand Chinese characters, with several thousand used regularly. In 1980, the Standardization Administration of China released the "GB2312" character set, which included 6763 Chinese characters, essentially fulfilling the computer processing needs for the Chinese language.</p>
<p>However, GB2312 could not handle some rare and traditional characters. The "GBK" character set expands GB2312 and includes 21886 Chinese characters. In the GBK encoding scheme, ASCII characters are represented with one byte, while Chinese characters use two bytes.</p>
<h2 id="343-unicode-character-set">3.4.3 &nbsp; Unicode Character Set<a class="headerlink" href="#343-unicode-character-set" title="Permanent link">&para;</a></h2>
<p>With the rapid evolution of computer technology and a plethora of character sets and encoding standards, numerous problems arose. On the one hand, these character sets generally only defined characters for specific languages and could not function properly in multilingual environments. On the other hand, the existence of multiple character set standards for the same language caused garbled text when information was exchanged between computers using different encoding standards.</p>
<p>Researchers of that era thought: <strong>What if a comprehensive character set encompassing all global languages and symbols was developed? Wouldn't this resolve the issues associated with cross-linguistic environments and garbled text?</strong> Inspired by this idea, the extensive character set, Unicode, was born.</p>
<p>"Unicode" is referred to as "统一码" (Unified Code) in Chinese, theoretically capable of accommodating over a million characters. It aims to incorporate characters from all over the world into a single set, providing a universal character set for processing and displaying various languages and reducing the issues of garbled text due to different encoding standards.</p>
<p>Since its release in 1991, Unicode has continually expanded to include new languages and characters. As of September 2022, Unicode contains 149,186 characters, including characters, symbols, and even emojis from various languages. In the vast Unicode character set, commonly used characters occupy 2 bytes, while some rare characters may occupy 3 or even 4 bytes.</p>
<p>Unicode is a universal character set that assigns a number (called a "code point") to each character, <strong>but it does not specify how these character code points should be stored in a computer system</strong>. One might ask: How does a system interpret Unicode code points of varying lengths within a text? For example, given a 2-byte code, how does the system determine if it represents a single 2-byte character or two 1-byte characters?</p>
<p>A straightforward solution to this problem is to store all characters as equal-length encodings. As shown in the Figure 3-7 , each character in "Hello" occupies 1 byte, while each character in "算法" (algorithm) occupies 2 bytes. We could encode all characters in "Hello 算法" as 2 bytes by padding the higher bits with zeros. This method would enable the system to interpret a character every 2 bytes, recovering the content of the phrase.</p>
<p><a class="glightbox" href="../character_encoding.assets/unicode_hello_algo.png" data-type="image" data-width="100%" data-height="auto" data-desc-position="bottom"><img alt="Unicode Encoding Example" class="animation-figure" src="../character_encoding.assets/unicode_hello_algo.png" /></a></p>
<p align="center"> Figure 3-7 &nbsp; Unicode Encoding Example </p>
<p>However, as ASCII has shown us, encoding English only requires 1 byte. Using the above approach would double the space occupied by English text compared to ASCII encoding, which is a waste of memory space. Therefore, a more efficient Unicode encoding method is needed.</p>
<h2 id="344-utf-8-encoding">3.4.4 &nbsp; UTF-8 Encoding<a class="headerlink" href="#344-utf-8-encoding" title="Permanent link">&para;</a></h2>
<p>Currently, UTF-8 has become the most widely used Unicode encoding method internationally. <strong>It is a variable-length encoding</strong>, using 1 to 4 bytes to represent a character, depending on the complexity of the character. ASCII characters need only 1 byte, Latin and Greek letters require 2 bytes, commonly used Chinese characters need 3 bytes, and some other rare characters need 4 bytes.</p>
<p>The encoding rules for UTF-8 are not complex and can be divided into two cases:</p>
<ul>
<li>For 1-byte characters, set the highest bit to <span class="arithmatex">\(0\)</span>, and the remaining 7 bits to the Unicode code point. Notably, ASCII characters occupy the first 128 code points in the Unicode set. This means that <strong>UTF-8 encoding is backward compatible with ASCII</strong>. This implies that UTF-8 can be used to parse ancient ASCII text.</li>
<li>For characters of length <span class="arithmatex">\(n\)</span> bytes (where <span class="arithmatex">\(n &gt; 1\)</span>), set the highest <span class="arithmatex">\(n\)</span> bits of the first byte to <span class="arithmatex">\(1\)</span>, and the <span class="arithmatex">\((n + 1)^{\text{th}}\)</span> bit to <span class="arithmatex">\(0\)</span>; starting from the second byte, set the highest 2 bits of each byte to <span class="arithmatex">\(10\)</span>; the rest of the bits are used to fill the Unicode code point.</li>
</ul>
<p>The Figure 3-8 shows the UTF-8 encoding for "Hello算法". It can be observed that since the highest <span class="arithmatex">\(n\)</span> bits are set to <span class="arithmatex">\(1\)</span>, the system can determine the length of the character as <span class="arithmatex">\(n\)</span> by counting the number of highest bits set to <span class="arithmatex">\(1\)</span>.</p>
<p>But why set the highest 2 bits of the remaining bytes to <span class="arithmatex">\(10\)</span>? Actually, this <span class="arithmatex">\(10\)</span> serves as a kind of checksum. If the system starts parsing text from an incorrect byte, the <span class="arithmatex">\(10\)</span> at the beginning of the byte can help the system quickly detect anomalies.</p>
<p>The reason for using <span class="arithmatex">\(10\)</span> as a checksum is that, under UTF-8 encoding rules, it's impossible for the highest two bits of a character to be <span class="arithmatex">\(10\)</span>. This can be proven by contradiction: If the highest two bits of a character are <span class="arithmatex">\(10\)</span>, it indicates that the character's length is <span class="arithmatex">\(1\)</span>, corresponding to ASCII. However, the highest bit of an ASCII character should be <span class="arithmatex">\(0\)</span>, which contradicts the assumption.</p>
<p><a class="glightbox" href="../character_encoding.assets/utf-8_hello_algo.png" data-type="image" data-width="100%" data-height="auto" data-desc-position="bottom"><img alt="UTF-8 Encoding Example" class="animation-figure" src="../character_encoding.assets/utf-8_hello_algo.png" /></a></p>
<p align="center"> Figure 3-8 &nbsp; UTF-8 Encoding Example </p>
<p>Apart from UTF-8, other common encoding methods include:</p>
<ul>
<li><strong>UTF-16 Encoding</strong>: Uses 2 or 4 bytes to represent a character. All ASCII characters and commonly used non-English characters are represented with 2 bytes; a few characters require 4 bytes. For 2-byte characters, the UTF-16 encoding equals the Unicode code point.</li>
<li><strong>UTF-32 Encoding</strong>: Every character uses 4 bytes. This means UTF-32 occupies more space than UTF-8 and UTF-16, especially for texts with a high proportion of ASCII characters.</li>
</ul>
<p>From the perspective of storage space, using UTF-8 to represent English characters is very efficient because it only requires 1 byte; using UTF-16 to encode some non-English characters (such as Chinese) can be more efficient because it only requires 2 bytes, while UTF-8 might need 3 bytes.</p>
<p>From a compatibility perspective, UTF-8 is the most versatile, with many tools and libraries supporting UTF-8 as a priority.</p>
<h2 id="345-character-encoding-in-programming-languages">3.4.5 &nbsp; Character Encoding in Programming Languages<a class="headerlink" href="#345-character-encoding-in-programming-languages" title="Permanent link">&para;</a></h2>
<p>Historically, many programming languages utilized fixed-length encodings such as UTF-16 or UTF-32 for processing strings during program execution. This allows strings to be handled as arrays, offering several advantages:</p>
<ul>
<li><strong>Random Access</strong>: Strings encoded in UTF-16 can be accessed randomly with ease. For UTF-8, which is a variable-length encoding, locating the <span class="arithmatex">\(i^{th}\)</span> character requires traversing the string from the start to the <span class="arithmatex">\(i^{th}\)</span> position, taking <span class="arithmatex">\(O(n)\)</span> time.</li>
<li><strong>Character Counting</strong>: Similar to random access, counting the number of characters in a UTF-16 encoded string is an <span class="arithmatex">\(O(1)\)</span> operation. However, counting characters in a UTF-8 encoded string requires traversing the entire string.</li>
<li><strong>String Operations</strong>: Many string operations like splitting, concatenating, inserting, and deleting are easier on UTF-16 encoded strings. These operations generally require additional computation on UTF-8 encoded strings to ensure the validity of the UTF-8 encoding.</li>
</ul>
<p>The design of character encoding schemes in programming languages is an interesting topic involving various factors:</p>
<ul>
<li>Javas <code>String</code> type uses UTF-16 encoding, with each character occupying 2 bytes. This was based on the initial belief that 16 bits were sufficient to represent all possible characters and proven incorrect later. As the Unicode standard expanded beyond 16 bits, characters in Java may now be represented by a pair of 16-bit values, known as “surrogate pairs.”</li>
<li>JavaScript and TypeScript use UTF-16 encoding for similar reasons as Java. When JavaScript was first introduced by Netscape in 1995, Unicode was still in its early stages, and 16-bit encoding was sufficient to represent all Unicode characters.</li>
<li>C# uses UTF-16 encoding, largely because the .NET platform, designed by Microsoft, and many Microsoft technologies, including the Windows operating system, extensively use UTF-16 encoding.</li>
</ul>
<p>Due to the underestimation of character counts, these languages had to use "surrogate pairs" to represent Unicode characters exceeding 16 bits. This approach has its drawbacks: strings containing surrogate pairs may have characters occupying 2 or 4 bytes, losing the advantage of fixed-length encoding. Additionally, handling surrogate pairs adds complexity and debugging difficulty to programming.</p>
<p>Addressing these challenges, some languages have adopted alternative encoding strategies:</p>
<ul>
<li>Pythons <code>str</code> type uses Unicode encoding with a flexible representation where the storage length of characters depends on the largest Unicode code point in the string. If all characters are ASCII, each character occupies 1 byte, 2 bytes for characters within the Basic Multilingual Plane (BMP), and 4 bytes for characters beyond the BMP.</li>
<li>Gos <code>string</code> type internally uses UTF-8 encoding. Go also provides the <code>rune</code> type for representing individual Unicode code points.</li>
<li>Rusts <code>str</code> and <code>String</code> types use UTF-8 encoding internally. Rust also offers the <code>char</code> type for individual Unicode code points.</li>
</ul>
<p>Its important to note that the above discussion pertains to how strings are stored in programming languages, <strong>which is different from how strings are stored in files or transmitted over networks</strong>. For file storage or network transmission, strings are usually encoded in UTF-8 format for optimal compatibility and space efficiency.</p>
<!-- Source file information -->
<!-- Was this page helpful? -->
<!-- Previous and next pages link -->
<nav
class="md-footer__inner md-grid"
aria-label="Footer"
>
<!-- Link to previous page -->
<a
href="../number_encoding/"
class="md-footer__link md-footer__link--prev"
aria-label="Previous: 3.3 Number Encoding *"
rel="prev"
>
<div class="md-footer__button md-icon">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
</div>
<div class="md-footer__title">
<span class="md-footer__direction">
Previous
</span>
<div class="md-ellipsis">
3.3 Number Encoding *
</div>
</div>
</a>
<!-- Link to next page -->
<a
href="../summary/"
class="md-footer__link md-footer__link--next"
aria-label="Next: 3.5 Summary"
rel="next"
>
<div class="md-footer__title">
<span class="md-footer__direction">
Next
</span>
<div class="md-ellipsis">
3.5 Summary
</div>
</div>
<div class="md-footer__button md-icon">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M4 11v2h12l-5.5 5.5 1.42 1.42L19.84 12l-7.92-7.92L10.5 5.5 16 11H4Z"/></svg>
</div>
</a>
</nav>
<!-- Comment system -->
<h5 align="center" id="__comments">Feel free to drop your insights, questions or suggestions</h5>
<!-- Insert generated snippet here -->
<script
src="https://giscus.app/client.js"
data-repo="krahets/hello-algo"
data-repo-id="R_kgDOIXtSqw"
data-category="Announcements"
data-category-id="DIC_kwDOIXtSq84CSZk_"
data-mapping="pathname"
data-strict="1"
data-reactions-enabled="1"
data-emit-metadata="0"
data-input-position="top"
data-theme="light"
data-lang="en"
crossorigin="anonymous"
async
>
</script>
<!-- Synchronize Giscus theme with palette -->
<script>
var giscus = document.querySelector("script[src*=giscus]")
/* Set palette on initial load */
var palette = __md_get("__palette")
if (palette && typeof palette.color === "object") {
var theme = palette.color.scheme === "slate" ? "dark_dimmed" : "light"
giscus.setAttribute("data-theme", theme)
}
/* Register event handlers after documented loaded */
document.addEventListener("DOMContentLoaded", function() {
var ref = document.querySelector("[data-md-component=palette]")
ref.addEventListener("change", function() {
var palette = __md_get("__palette")
if (palette && typeof palette.color === "object") {
var theme = palette.color.scheme === "slate" ? "dark_dimmed" : "light"
/* Instruct Giscus to change theme */
var frame = document.querySelector(".giscus-frame")
frame.contentWindow.postMessage(
{ giscus: { setConfig: { theme } } },
"https://giscus.app"
)
}
})
})
</script>
</article>
</div>
<script>var tabs=__md_get("__tabs");if(Array.isArray(tabs))e:for(var set of document.querySelectorAll(".tabbed-set")){var tab,labels=set.querySelector(".tabbed-labels");for(tab of tabs)for(var label of labels.getElementsByTagName("label"))if(label.innerText.trim()===tab){var input=document.getElementById(label.htmlFor);input.checked=!0;continue e}}</script>
<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
</div>
<button type="button" class="md-top md-icon" data-md-component="top" hidden>
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M13 20h-2V8l-5.5 5.5-1.42-1.42L12 4.16l7.92 7.92-1.42 1.42L13 8v12Z"/></svg>
Back to top
</button>
</main>
<footer class="md-footer">
<nav class="md-footer__inner md-grid" aria-label="Footer" >
<a href="../number_encoding/" class="md-footer__link md-footer__link--prev" aria-label="Previous: 3.3 Number Encoding *">
<div class="md-footer__button md-icon">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
</div>
<div class="md-footer__title">
<span class="md-footer__direction">
Previous
</span>
<div class="md-ellipsis">
3.3 Number Encoding *
</div>
</div>
</a>
<a href="../summary/" class="md-footer__link md-footer__link--next" aria-label="Next: 3.5 Summary">
<div class="md-footer__title">
<span class="md-footer__direction">
Next
</span>
<div class="md-ellipsis">
3.5 Summary
</div>
</div>
<div class="md-footer__button md-icon">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M4 11v2h12l-5.5 5.5 1.42 1.42L19.84 12l-7.92-7.92L10.5 5.5 16 11H4Z"/></svg>
</div>
</a>
</nav>
<div class="md-footer-meta md-typeset">
<div class="md-footer-meta__inner md-grid">
<div class="md-copyright">
<div class="md-copyright__highlight">
Copyright &copy; 2022-2024 krahets</br>The website content is licensed under <a href="https://creativecommons.org/licenses/by-nc-sa/4.0/">CC BY-NC-SA 4.0</a>
</div>
</div>
<div class="md-social">
<a href="https://github.com/krahets" target="_blank" rel="noopener" title="github.com" class="md-social__link">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 496 512"><!--! Font Awesome Free 6.5.1 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2023 Fonticons, Inc.--><path d="M165.9 397.4c0 2-2.3 3.6-5.2 3.6-3.3.3-5.6-1.3-5.6-3.6 0-2 2.3-3.6 5.2-3.6 3-.3 5.6 1.3 5.6 3.6zm-31.1-4.5c-.7 2 1.3 4.3 4.3 4.9 2.6 1 5.6 0 6.2-2s-1.3-4.3-4.3-5.2c-2.6-.7-5.5.3-6.2 2.3zm44.2-1.7c-2.9.7-4.9 2.6-4.6 4.9.3 2 2.9 3.3 5.9 2.6 2.9-.7 4.9-2.6 4.6-4.6-.3-1.9-3-3.2-5.9-2.9zM244.8 8C106.1 8 0 113.3 0 252c0 110.9 69.8 205.8 169.5 239.2 12.8 2.3 17.3-5.6 17.3-12.1 0-6.2-.3-40.4-.3-61.4 0 0-70 15-84.7-29.8 0 0-11.4-29.1-27.8-36.6 0 0-22.9-15.7 1.6-15.4 0 0 24.9 2 38.6 25.8 21.9 38.6 58.6 27.5 72.9 20.9 2.3-16 8.8-27.1 16-33.7-55.9-6.2-112.3-14.3-112.3-110.5 0-27.5 7.6-41.3 23.6-58.9-2.6-6.5-11.1-33.3 2.6-67.9 20.9-6.5 69 27 69 27 20-5.6 41.5-8.5 62.8-8.5s42.8 2.9 62.8 8.5c0 0 48.1-33.6 69-27 13.7 34.7 5.2 61.4 2.6 67.9 16 17.7 25.8 31.5 25.8 58.9 0 96.5-58.9 104.2-114.8 110.5 9.2 7.9 17 22.9 17 46.4 0 33.7-.3 75.4-.3 83.6 0 6.5 4.6 14.4 17.3 12.1C428.2 457.8 496 362.9 496 252 496 113.3 383.5 8 244.8 8zM97.2 352.9c-1.3 1-1 3.3.7 5.2 1.6 1.6 3.9 2.3 5.2 1 1.3-1 1-3.3-.7-5.2-1.6-1.6-3.9-2.3-5.2-1zm-10.8-8.1c-.7 1.3.3 2.9 2.3 3.9 1.6 1 3.6.7 4.3-.7.7-1.3-.3-2.9-2.3-3.9-2-.6-3.6-.3-4.3.7zm32.4 35.6c-1.6 1.3-1 4.3 1.3 6.2 2.3 2.3 5.2 2.6 6.5 1 1.3-1.3.7-4.3-1.3-6.2-2.2-2.3-5.2-2.6-6.5-1zm-11.4-14.7c-1.6 1-1.6 3.6 0 5.9 1.6 2.3 4.3 3.3 5.6 2.3 1.6-1.3 1.6-3.9 0-6.2-1.4-2.3-4-3.3-5.6-2z"/></svg>
</a>
<a href="https://twitter.com/krahets" target="_blank" rel="noopener" title="twitter.com" class="md-social__link">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512"><!--! Font Awesome Free 6.5.1 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2023 Fonticons, Inc.--><path d="M459.37 151.716c.325 4.548.325 9.097.325 13.645 0 138.72-105.583 298.558-298.558 298.558-59.452 0-114.68-17.219-161.137-47.106 8.447.974 16.568 1.299 25.34 1.299 49.055 0 94.213-16.568 130.274-44.832-46.132-.975-84.792-31.188-98.112-72.772 6.498.974 12.995 1.624 19.818 1.624 9.421 0 18.843-1.3 27.614-3.573-48.081-9.747-84.143-51.98-84.143-102.985v-1.299c13.969 7.797 30.214 12.67 47.431 13.319-28.264-18.843-46.781-51.005-46.781-87.391 0-19.492 5.197-37.36 14.294-52.954 51.655 63.675 129.3 105.258 216.365 109.807-1.624-7.797-2.599-15.918-2.599-24.04 0-57.828 46.782-104.934 104.934-104.934 30.213 0 57.502 12.67 76.67 33.137 23.715-4.548 46.456-13.32 66.599-25.34-7.798 24.366-24.366 44.833-46.132 57.827 21.117-2.273 41.584-8.122 60.426-16.243-14.292 20.791-32.161 39.308-52.628 54.253z"/></svg>
</a>
<a href="https://leetcode.cn/u/jyd/" target="_blank" rel="noopener" title="leetcode.cn" class="md-social__link">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 640 512"><!--! Font Awesome Free 6.5.1 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2023 Fonticons, Inc.--><path d="M392.8 1.2c-17-4.9-34.7 5-39.6 22l-128 448c-4.9 17 5 34.7 22 39.6s34.7-5 39.6-22l128-448c4.9-17-5-34.7-22-39.6zm80.6 120.1c-12.5 12.5-12.5 32.8 0 45.3l89.3 89.4-89.4 89.4c-12.5 12.5-12.5 32.8 0 45.3s32.8 12.5 45.3 0l112-112c12.5-12.5 12.5-32.8 0-45.3l-112-112c-12.5-12.5-32.8-12.5-45.3 0zm-306.7 0c-12.5-12.5-32.8-12.5-45.3 0l-112 112c-12.5 12.5-12.5 32.8 0 45.3l112 112c12.5 12.5 32.8 12.5 45.3 0s12.5-32.8 0-45.3L77.3 256l89.4-89.4c12.5-12.5 12.5-32.8 0-45.3z"/></svg>
</a>
</div>
</div>
</div>
</footer>
</div>
<div class="md-dialog" data-md-component="dialog">
<div class="md-dialog__inner md-typeset"></div>
</div>
<script id="__config" type="application/json">{"base": "../..", "features": ["announce.dismiss", "content.action.edit", "content.code.annotate", "content.code.copy", "content.tabs.link", "content.tooltips", "navigation.indexes", "navigation.top", "navigation.footer", "navigation.tracking", "search.highlight", "search.share", "search.suggest", "toc.follow"], "search": "../../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
<script src="../../assets/javascripts/bundle.c18c5fb9.min.js"></script>
<script src="../../javascripts/mathjax.js"></script>
<script src="https://polyfill.io/v3/polyfill.min.js?features=es6"></script>
<script src="https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js"></script>
<script>document$.subscribe(() => {const lightbox = GLightbox({"touchNavigation": true, "loop": false, "zoomable": true, "draggable": false, "openEffect": "zoom", "closeEffect": "zoom", "slideEffect": "none"});})</script></body>
</html>