> ## Documentation Index
> Fetch the complete documentation index at: https://docs.poly.ai/llms.txt
> Use this file to discover all available pages before exploring further.

# Tutorial: Polishing the voice experience

> PolyAcademy Level 3 – Master filler, turn-taking, voice quality, and personalization.

export const LessonMeta = ({level, difficulty, time}) => {
  /*
   * Row of lesson metadata pills: a level badge, an optional difficulty
   * badge and an optional duration with a clock icon.
   *
   * Props:
   *   level      - 1, 2 or 3. Any other value falls back to the Level 1 badge.
   *   difficulty - 'Beginner', 'Intermediate' or 'Advanced'. Any other
   *                non-empty label is shown using the Beginner colours.
   *                When omitted, no difficulty badge is rendered.
   *   time       - optional duration label such as "15 min".
   *
   * NOTE: this comment sits inside the function body on purpose; in MDX the
   * `export` keyword has to be the first thing in the block.
   */
  // Level and difficulty share the same three colour schemes.
  const tone = {
    green: 'bg-green-100 text-green-800 dark:bg-green-900 dark:text-green-200',
    amber: 'bg-amber-100 text-amber-800 dark:bg-amber-900 dark:text-amber-200',
    red: 'bg-red-100 text-red-800 dark:bg-red-900 dark:text-red-200'
  };
  const levelConfig = {
    1: {badge: tone.green, label: 'Level 1'},
    2: {badge: tone.amber, label: 'Level 2'},
    3: {badge: tone.red, label: 'Level 3'}
  };
  const difficultyConfig = {
    Beginner: tone.green,
    Intermediate: tone.amber,
    Advanced: tone.red
  };
  const pill = 'inline-flex items-center rounded-full px-2.5 py-0.5 text-xs font-semibold';
  const lvl = levelConfig[level] ?? levelConfig[1];
  const diffColor = difficultyConfig[difficulty] ?? difficultyConfig.Beginner;
  return <div className="flex flex-wrap items-center gap-2 my-4 not-prose">
      <span className={`${pill} ${lvl.badge}`}>
        {lvl.label}
      </span>
      {/* Skip the badge entirely when no difficulty is given, instead of drawing an empty pill. */}
      {difficulty ? <span className={`${pill} ${diffColor}`}>
          {difficulty}
        </span> : null}
      {/* Ternary rather than `&&` so a falsy non-string value is never printed. */}
      {time ? <span className="inline-flex items-center gap-1 text-xs text-gray-500 dark:text-gray-400">
          <svg aria-hidden="true" className="w-3.5 h-3.5" fill="none" viewBox="0 0 24 24" stroke="currentColor" strokeWidth={2}>
            <path strokeLinecap="round" strokeLinejoin="round" d="M12 6v6h4.5m4.5 0a9 9 0 11-18 0 9 9 0 0118 0z" />
          </svg>
          {time}
        </span> : null}
    </div>;
};

export const ProgressTracker = ({lessonNum, totalLessons, level}) => {
  /*
   * Clickable "mark lesson complete" card that behaves like a checkbox.
   *
   * Props:
   *   lessonNum, totalLessons - when both are truthy, a "Lesson X of Y" line
   *                             is shown under the label.
   *   level                   - optional prefix for that line, e.g. "Level 3".
   *
   * The checked state lives only in component state, so it starts unchecked
   * on every mount. The page also passes a `lessonKey` prop, but nothing here
   * reads it.
   * NOTE(review): if completion is meant to persist per lesson, `lessonKey`
   * is presumably the intended storage key - confirm before wiring it up.
   */
  const [checked, setChecked] = useState(false);
  const toggle = () => setChecked(prev => !prev);
  // A div is not keyboard-operable by default, so handle the keys a checkbox
  // user expects. preventDefault stops Space from scrolling the page.
  const handleKeyDown = event => {
    if (event.key === ' ' || event.key === 'Enter') {
      event.preventDefault();
      toggle();
    }
  };
  const cardClass = checked
    ? 'flex items-center gap-3 p-4 rounded-lg border-2 border-green-600 bg-green-50 dark:bg-green-950 cursor-pointer select-none transition-all'
    : 'flex items-center gap-3 p-4 rounded-lg border-2 border-gray-200 dark:border-gray-600 bg-gray-50 dark:bg-gray-800 cursor-pointer select-none transition-all';
  const boxClass = checked
    ? 'w-5 h-5 rounded border-2 border-green-600 bg-green-600 flex items-center justify-center shrink-0 transition-all'
    : 'w-5 h-5 rounded border-2 border-gray-400 dark:border-gray-500 bg-white dark:bg-gray-800 flex items-center justify-center shrink-0 transition-all';
  const labelClass = checked
    ? 'font-semibold text-sm text-green-700 dark:text-green-300'
    : 'font-semibold text-sm text-gray-700 dark:text-gray-200';
  return <div role="checkbox" aria-checked={checked} tabIndex={0} onClick={toggle} onKeyDown={handleKeyDown} className={cardClass}>
      <div className={boxClass}>
        {checked ? <svg aria-hidden="true" width="10" height="8" viewBox="0 0 10 8" fill="none">
            <path d="M1 4L3.5 6.5L9 1" stroke="white" strokeWidth="2" strokeLinecap="round" strokeLinejoin="round" />
          </svg> : null}
      </div>
      <div>
        <div className={labelClass}>
          {checked ? 'Lesson complete' : 'Mark lesson complete'}
        </div>
        {lessonNum && totalLessons ? <div className="text-xs text-gray-500 dark:text-gray-400 mt-0.5">
            {level ? level + ' - ' : ''}Lesson {lessonNum} of {totalLessons}
          </div> : null}
      </div>
    </div>;
};

export const Quiz = ({questions = []}) => {
  /*
   * Multiple-choice quiz with immediate feedback.
   *
   * Props:
   *   questions - array of {q, options, correct, explanation}:
   *     q           - question text
   *     options     - array of answer labels
   *     correct     - zero-based index into `options` of the right answer
   *     explanation - text shown once the question has been answered
   *
   * Each question accepts one answer; later clicks on it are ignored until
   * "Reset quiz" is pressed. Renders nothing when there are no questions.
   */
  const [selected, setSelected] = useState({});
  const [resetCount, setResetCount] = useState(0);
  // Badge letter for an option index: A-Z, then plain numbers, so questions
  // with more than four options no longer get an empty badge.
  const optionLetter = i => (i < 26 ? String.fromCharCode(65 + i) : String(i + 1));
  const handleSelect = (qIdx, optIdx) => {
    // The first answer is final until the quiz is reset.
    if (selected[qIdx] !== undefined) return;
    setSelected(prev => ({
      ...prev,
      [qIdx]: optIdx
    }));
  };
  const handleReset = () => {
    setSelected({});
    // Changing the root key remounts the quiz subtree.
    setResetCount(c => c + 1);
  };
  if (!questions?.length) return null;
  // Classes shared by every option button / letter badge, whatever its state.
  const btnBase = 'flex w-full items-center gap-3 py-2.5 px-4 rounded-xl text-sm leading-normal transition-all duration-150 text-left border';
  const badgeBase = 'w-6 h-6 rounded-full text-xs font-bold flex items-center justify-center shrink-0 leading-none transition-all duration-150';
  // Picks button classes, badge classes and trailing icon for one option.
  // Precedence: unanswered, then the correct option, then the user's wrong
  // pick, then the remaining (dimmed) options.
  const getOptionClasses = ({hasAnswered, isThisCorrect, isThisSelected}) => {
    if (!hasAnswered) {
      return {
        btn: `${btnBase} cursor-pointer border-gray-200 bg-white text-gray-700 hover:border-gray-300 hover:bg-gray-50 hover:shadow-sm dark:border-gray-600 dark:bg-gray-800 dark:text-gray-200 dark:hover:border-gray-500 dark:hover:bg-gray-700`,
        badge: `${badgeBase} bg-gray-100 text-gray-500 dark:bg-gray-700 dark:text-gray-300`,
        icon: null
      };
    }
    if (isThisCorrect) {
      return {
        btn: `${btnBase} cursor-default border-green-400 bg-green-50 text-green-900 font-medium dark:border-green-500 dark:bg-green-950 dark:text-green-100`,
        badge: `${badgeBase} bg-green-500 text-white dark:bg-green-500`,
        icon: <svg className="shrink-0 w-4 h-4 text-green-500 dark:text-green-400 ml-auto" fill="none" viewBox="0 0 24 24" stroke="currentColor" strokeWidth={2.5}>
            <path strokeLinecap="round" strokeLinejoin="round" d="M4.5 12.75l6 6 9-13.5" />
          </svg>
      };
    }
    if (isThisSelected) {
      return {
        btn: `${btnBase} cursor-default border-red-400 bg-red-50 text-red-900 dark:border-red-500 dark:bg-red-950 dark:text-red-100`,
        badge: `${badgeBase} bg-red-500 text-white dark:bg-red-500`,
        icon: <svg className="shrink-0 w-4 h-4 text-red-400 dark:text-red-400 ml-auto" fill="none" viewBox="0 0 24 24" stroke="currentColor" strokeWidth={2.5}>
            <path strokeLinecap="round" strokeLinejoin="round" d="M6 18L18 6M6 6l12 12" />
          </svg>
      };
    }
    return {
      btn: `${btnBase} cursor-default border-gray-100 bg-white text-gray-400 dark:border-gray-700 dark:bg-gray-800 dark:text-gray-500`,
      badge: `${badgeBase} bg-gray-100 text-gray-500 dark:bg-gray-700 dark:text-gray-500`,
      icon: null
    };
  };
  return <div key={resetCount} className="my-6">
      {questions.map((q, qIdx) => {
    const answer = selected[qIdx];
    const hasAnswered = answer !== undefined;
    const isCorrect = answer === q.correct;
    // A question without an options array renders with no buttons rather than throwing.
    const options = q.options ?? [];
    return <div key={String(qIdx)} className="mb-8">
            <p className="flex items-start gap-2.5 font-semibold text-sm mb-3 mt-0 leading-relaxed text-gray-900 dark:text-gray-100">
              <span className="inline-flex items-center justify-center w-5 h-5 rounded-full bg-gray-800 dark:bg-gray-200 text-white dark:text-gray-900 text-xs font-bold shrink-0 mt-px leading-none">
                {qIdx + 1}
              </span>
              {q.q}
            </p>
            <div className="flex flex-col gap-2">
              {options.map((opt, i) => {
      const {btn, badge, icon} = getOptionClasses({
        hasAnswered,
        isThisCorrect: i === q.correct,
        isThisSelected: i === answer
      });
      return <button key={String(i)} type="button" onClick={() => handleSelect(qIdx, i)} className={btn}>
                    <span className={badge}>{optionLetter(i)}</span>
                    <span className="flex-1">{opt}</span>
                    {icon}
                  </button>;
    })}
            </div>
            {hasAnswered ? <div className={`mt-3 py-3 pl-4 pr-3.5 rounded-r-xl text-sm leading-relaxed border-l-4 ${isCorrect ? 'border-green-500 bg-green-50 dark:bg-green-950 dark:border-green-500' : 'border-red-500 bg-red-50 dark:bg-red-950 dark:border-red-500'}`}>
                <span className={`font-semibold ${isCorrect ? '!text-green-800 dark:!text-green-200' : '!text-red-800 dark:!text-red-200'}`}>
                  {isCorrect ? 'Correct.' : 'Not quite.'}
                </span>{' '}
                <span className="!text-gray-700 dark:!text-gray-300">{q.explanation}</span>
              </div> : null}
          </div>;
  })}
      <button type="button" onClick={handleReset} className="mt-1 text-xs text-gray-400 hover:text-gray-600 dark:hover:text-gray-300 underline underline-offset-2 cursor-pointer transition-colors duration-150">
        Reset quiz
      </button>
    </div>;
};

**Level 3 – Lesson 5 of 5** – Go beyond usability to create voice experiences that sound genuinely good.

<LessonMeta level={3} difficulty="Advanced" time="15 min" />

After building an agent that works and is easy to use, the final layer is polish: selecting voices that perform well in practice, adding natural filler and hesitation, managing turn-taking, and personalizing based on user context.

## The layers of a good voice experience

<Steps>
  <Step title="It works">
    Speech recognition transcribes correctly, APIs respond, the task can be completed.
  </Step>

  <Step title="It's easy to use">
    The interaction is efficient, intuitive, and follows the [design principles](/learn/guides/design-principles).
  </Step>

  <Step title="It sounds good">
    Copywriting, voice quality, turn-taking, and personalization make the experience enjoyable. This is the focus of this lesson.
  </Step>
</Steps>

## Voice selection and quality

Pick a voice that sounds good in practice, not just in samples. If you need to regenerate 50 times to find one good take, that voice won't produce consistent quality in a live deployment.

**After selecting a voice:**

* Listen to the most common things the agent says: greeting, "how can I help", "anything else", and the main flow prompts
* The LLM often generates similar phrasing for repeated scenarios – these get cached, so make sure they sound good
* Regenerate cached audio until it sounds right

Written copy always looks more informal on the page than it sounds when spoken. Don't let written reviews make you over-formalise. When in doubt, build a short audio prototype and listen back – phrasing that looks too casual in text usually sounds perfectly natural out loud.

## Natural filler and hesitation

Real humans pause, say "um", and hesitate – especially when they're thinking. Adding small amounts of this to agent speech makes it sound more natural. In linguistics this is called **disfluency**, and it includes filled pauses ("um", "uh"), slight repetitions, and drawn-out sounds.

### When to use it

| Context                  | What to add                 | Example                                                        |
| ------------------------ | --------------------------- | -------------------------------------------------------------- |
| API call / lookup        | Filler phrase               | "Um, let me just have a look at what space we have..."         |
| Complex instructions     | Slight hesitation           | "So what you'll want to do is, uh, go to settings and then..." |
| After a misunderstanding | Drawn-out sound, regrouping | "Hmm, what was it I can do for you?"                           |

### Why it works

* **During API calls**: filler sounds like someone checking another screen – it matches what the user expects is happening
* **After misunderstandings**: hesitation sounds like someone regrouping after a miscommunication, which is exactly what's happening
* **In general**: small pauses signal that the agent is "thinking", which makes silence less awkward

<Warning>
  Keep it subtle. Too much filler makes the agent sound confused rather than natural. Use it situationally, not on every turn.
</Warning>

## Turn-taking

Turn-taking – how the agent and user take turns speaking – is one of the most impactful aspects of voice experience, and one of the hardest to control at the project level.

Three common problems:

* **Too much latency** – the agent takes too long to respond after the user finishes speaking. Users disengage.
* **Interruptions** – the agent starts speaking before the user has finished. Users get frustrated.
* **No barge-in** – the user cannot interrupt the agent, even when the agent is saying something wrong or irrelevant.

Many turn-taking issues need platform-level improvements rather than project-level fixes. If you encounter persistent turn-taking problems, document specific examples and contact support.

### What you can control

* **Response length** – shorter responses reduce the chance of the agent and user talking over each other
* **Interaction style settings** – adjust latency thresholds in [audio management](/learn/guides/advanced/audio-management)
* **Barge-in configuration** – enable or disable based on the interaction type
* **Front-load key information** – put the important part first, so even if the user interrupts, they've heard what matters

## Personalisation

Personalisation uses information about the user to tailor the experience. It works at three levels:

### From the current conversation

If the user gives their name, you can use it – but not on every turn. LLMs tend to overuse names, which sounds scripted. Use sparingly for warmth.

### From API data

If you can see a user's recent activity, use it to shortcut the conversation:

> "I can see you just canceled a flight. Is that what you're calling about?"

This proves competence immediately and shortens the interaction.

### From previous calls

If the user called before and was sent an SMS for self-service, and they're calling back:

> "I see you were calling about this earlier. Was that text not working for you?"

This kind of continuity across calls signals attentiveness and builds user confidence in the system.

<Warning>
  Personalisation can feel intrusive if overdone. Use it when it clearly helps the user reach their goal faster. Avoid making users feel surveilled.
</Warning>

<Quiz
  questions={[
{
q: "The agent is about to make an API call that takes 2-3 seconds. What's the best way to handle the silence?",
options: [
  "Play hold music",
  "Say nothing – users are accustomed to brief pauses",
  "Use a filler phrase: 'Um, let me just look that up for you...'",
  "Return a hard-coded message explaining what's happening technically",
],
correct: 2,
explanation: "A filler phrase sounds like someone checking another screen – it matches what the user expects and masks the waiting time naturally. Silence or technical explanations break the conversational flow.",
}
]}
/>

## Matching the user's style

People naturally adjust how they speak depending on who they're talking to. In voice agents, this happens partially through the LLM (which adjusts vocabulary and formality based on user input).

For now, focus on:

* **Word choice** – if the user uses informal language, the agent should match
* **Pacing** – if the user speaks slowly, don't rush them with rapid-fire responses
* **Formality** – match the user's level of formality

## Try it yourself

<Steps>
  <Step title="Challenge: Design the experience around an API lookup">
    A user asks to track their order. The flow collects the tracking number and then makes an API call that takes 2-3 seconds.

    Design:

    1. What does the agent say while the API call runs?
    2. How do you handle a successful lookup?
    3. How do you handle a failed lookup?

    For each, consider: filler, tone, brevity, and what information to say first.

    <Accordion title="Example solution">
      **During API call:**

      > "Okay, let me just pull that up for you..."
      > (Subtle filler – sounds like checking a screen)

      **Successful lookup:**

      > "Got it – your order's been shipped and should arrive Thursday. Want me to send you the tracking link?"
      > (Brief, key info first, natural offer for follow-up)

      **Failed lookup:**

      > "Hmm, I'm not finding anything for that number. Could you double-check it and try again?"
      > (Hesitation signals regrouping, blames the number not the user)
    </Accordion>
  </Step>
</Steps>

<Quiz
  questions={[
{
q: "You're reviewing a live call and notice the agent uses the caller's name in every single response. What should you do?",
options: [
  "Nothing – personalization is always good",
  "Remove name usage entirely",
  "Reduce frequency – use the name occasionally for warmth, not on every turn",
  "Only use the name at the start and end of the call",
],
correct: 2,
explanation: "LLMs tend to overuse names, which sounds scripted and unnatural. Use names sparingly for genuine warmth, not as a mechanical pattern on every turn.",
}
]}
/>

<CardGroup cols={2}>
  <Card title="← Previous: Writing agent speech" icon="arrow-left" href="/learn/guides/expert/utterance-design">
    Lesson 4 of 5
  </Card>

  <Card title="Level 3 complete →" icon="trophy" href="/learn/guides/expert/finished">
    Recap and next steps
  </Card>
</CardGroup>

<ProgressTracker lessonKey="l3-5-voice-polish" lessonNum={5} totalLessons={5} level="Level 3" />
