Skip to main content
Glama
test_french_accents.py (4.56 kB)
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Check how the OCR recognition post-processor groups French text into words.

Each case feeds a short French string (accented letters such as é, è, à, ç,
and apostrophe contractions such as n'êtes) to
``BaseRecLabelDecode.get_word_info`` and compares the returned word groups
and state labels against the expected values: every word must come back as a
single group rather than being split at an accented character.
"""

import sys
import os
import numpy as np

# Make the project root importable so ``ppocr`` resolves when this file is
# executed directly as a script.
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))

from ppocr.postprocess.rec_postprocess import BaseRecLabelDecode


def test_french_word_grouping():
    """Run every French grouping case and print a per-case report.

    Returns:
        bool: True when every case produced the expected words and states,
        False as soon as at least one case mismatched.
    """
    label_decoder = BaseRecLabelDecode(character_dict_path=None, use_space_char=True)

    # Each case carries the input text, the expected per-word character
    # lists and the expected state label for each word.
    cases = [
        {
            "name": "Simple accented word: été (summer)",
            "text": "été",
            "expected_words": [["é", "t", "é"]],
            "expected_states": ["en&num"],
        },
        {
            "name": "Word with ç: français (French)",
            "text": "français",
            "expected_words": [["f", "r", "a", "n", "ç", "a", "i", "s"]],
            "expected_states": ["en&num"],
        },
        {
            "name": "Contraction: n'êtes (you are)",
            "text": "n'êtes",
            "expected_words": [["n", "'", "ê", "t", "e", "s"]],
            "expected_states": ["en&num"],
        },
        {
            "name": "Multiple accents: élève (student)",
            "text": "élève",
            "expected_words": [["é", "l", "è", "v", "e"]],
            "expected_states": ["en&num"],
        },
        {
            "name": "Word with à: à demain (see you tomorrow)",
            "text": "à demain",
            "expected_words": [["à"], ["d", "e", "m", "a", "i", "n"]],
            "expected_states": ["en&num", "en&num"],
        },
        {
            "name": "Complex: C'était très français (It was very French)",
            "text": "C'était très français",
            "expected_words": [
                ["C", "'", "é", "t", "a", "i", "t"],
                ["t", "r", "è", "s"],
                ["f", "r", "a", "n", "ç", "a", "i", "s"],
            ],
            "expected_states": ["en&num", "en&num", "en&num"],
        },
    ]

    banner = "=" * 70
    print(banner)
    print("Testing French Accented Character Word Grouping")
    print(banner)

    failure_count = 0

    for case in cases:
        label = case["name"]
        raw_text = case["text"]
        want_words = case["expected_words"]
        want_states = case["expected_states"]

        # Mark every character position as selected.
        char_mask = np.ones(len(raw_text), dtype=bool)

        # The middle return value (per-word column info) is not checked here.
        got_words, _columns, got_states = label_decoder.get_word_info(
            raw_text, char_mask
        )

        # Collect the detail lines for the first kind of mismatch found:
        # word count, then states, then the first differing word.
        problems = []
        if len(got_words) != len(want_words):
            problems.append(
                f" Expected {len(want_words)} words, got {len(got_words)}"
            )
        elif got_states != want_states:
            problems.append(f" Expected states: {want_states}")
            problems.append(f" Got states: {got_states}")
        else:
            for position, (want, got) in enumerate(zip(want_words, got_words)):
                if want != got:
                    problems.append(
                        f" Word {position}: Expected {want}, got {got}"
                    )
                    break

        if not problems:
            print(f"\nPASSED: {label}")
            print(f" Text: '{raw_text}'")
            print(f" Words: {[''.join(w) for w in got_words]}")
            print(f" States: {got_states}")
            continue

        failure_count += 1
        print(f"\nFAILED: {label}")
        for detail in problems:
            print(detail)
        print(f" Text: '{raw_text}'")
        print(f" Expected words: {[''.join(w) for w in want_words]}")
        print(f" Got words: {[''.join(w) for w in got_words]}")

    everything_ok = failure_count == 0

    print("\n" + banner)
    if everything_ok:
        print("All tests PASSED! French accented words are properly grouped.")
    else:
        print("Some tests FAILED. Please review the output above.")
    print(banner)

    return everything_ok


if __name__ == "__main__":
    # Exit status 0 on success, 1 on any failure, for use from shells/CI.
    exit_status = 0 if test_french_word_grouping() else 1
    sys.exit(exit_status)

Latest Blog Posts

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/PaddlePaddle/PaddleOCR'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.