GET /datasets/?format=api
HTTP 200 OK
Allow: GET, POST, HEAD, OPTIONS
Content-Type: application/json
Vary: Accept

[
    {
        "id": 15,
        "title": "IndicVoices",
        "area": "ASR",
        "published_on": "2024-06-01",
        "conference": "ACL",
        "description": "IndicVoices is a dataset of natural and spontaneous speech containing a total of 12000 hours of read (8%), extempore (76%) and conversational (15%) audio from 22563 speakers covering 208 Indian districts and 22 languages. Of these 12000 hours, 3200 hours have already been transcribed, with a median of 122 hours per language.",
        "paper_link": "https://arxiv.org/abs/2403.01926",
        "website_link": "https://huggingface.co/datasets/ai4bharat/IndicVoices",
        "github_link": "https://github.com/AI4Bharat/IndicVoices",
        "hf_link": "https://huggingface.co/datasets/ai4bharat/IndicVoices",
        "paper_award": null,
        "license": []
    },
    {
        "id": 16,
        "title": "Lahaja",
        "area": "ASR",
        "published_on": "2024-06-01",
        "conference": "INTERSPEECH",
        "description": "Lahaja is a benchmark featuring 12.5 hours of Hindi audio to facilitate a comprehensive assessment of Hindi ASR systems across various accents. This dataset includes read and spontaneous speech on diverse topics, collected from 132 speakers across 83 districts in India.",
        "paper_link": "https://arxiv.org/abs/2408.11440",
        "website_link": "https://huggingface.co/datasets/ai4bharat/Lahaja",
        "github_link": "https://github.com/AI4Bharat/Lahaja",
        "hf_link": "https://huggingface.co/datasets/ai4bharat/Lahaja",
        "paper_award": null,
        "license": []
    },
    {
        "id": 17,
        "title": "Shrutilipi",
        "area": "ASR",
        "published_on": "2022-08-26",
        "conference": "ICASSP",
        "description": "Shrutilipi is a labelled ASR corpus obtained by mining parallel audio and text pairs at the document scale from All India Radio news bulletins for 12 Indian languages: Bengali, Gujarati, Hindi, Kannada, Malayalam, Marathi, Odia, Punjabi, Sanskrit, Tamil, Telugu, Urdu. The corpus has over 6400 hours of data across all languages.",
        "paper_link": "https://arxiv.org/abs/2208.12666",
        "website_link": "https://huggingface.co/datasets/ai4bharat/Shrutilipi",
        "github_link": null,
        "hf_link": "https://huggingface.co/datasets/ai4bharat/Shrutilipi",
        "paper_award": null,
        "license": []
    },
    {
        "id": 19,
        "title": "Svarah",
        "area": "ASR",
        "published_on": "2023-08-01",
        "conference": "INTERSPEECH",
        "description": "Svarah is a benchmark addressing gaps in ASR performance on Indian accents, featuring 9.6 hours of transcribed English audio from 117 speakers across 65 locations in India. It includes both read and spontaneous speech across various domains, ensuring diverse vocabulary.",
        "paper_link": "https://arxiv.org/abs/2305.15760",
        "website_link": "https://huggingface.co/datasets/ai4bharat/Svarah",
        "github_link": "https://github.com/AI4Bharat/Svarah",
        "hf_link": "https://huggingface.co/datasets/ai4bharat/Svarah",
        "paper_award": null,
        "license": []
    },
    {
        "id": 20,
        "title": "Kathbath",
        "area": "ASR",
        "published_on": "2023-02-01",
        "conference": "AAAI",
        "description": "Kathbath is a comprehensive dataset comprising 1,684 hours of labeled speech data collected from 1,218 contributors across 203 districts in India, spanning across 12 Indian languages.",
        "paper_link": "https://arxiv.org/abs/2208.11761",
        "website_link": "https://huggingface.co/datasets/ai4bharat/kathbath",
        "github_link": "https://github.com/AI4Bharat/IndicSUPERB",
        "hf_link": "https://huggingface.co/datasets/ai4bharat/kathbath",
        "paper_award": null,
        "license": []
    },
    {
        "id": 22,
        "title": "Dhwani",
        "area": "ASR",
        "published_on": "2022-02-01",
        "conference": "AAAI",
        "description": "Dhwani is a unlabelled audio dataset consisting of 17,000 hours of raw speech data for 40 Indian languages from a wide variety of domains including education, news, technology, and finance",
        "paper_link": "https://arxiv.org/abs/2111.03945",
        "website_link": "https://github.com/AI4Bharat/IndicWav2Vec/tree/main/data_prep_scripts/urls",
        "github_link": "https://github.com/AI4Bharat/IndicWav2Vec/tree/main/data_prep_scripts/urls",
        "hf_link": "https://huggingface.co/ai4bharat",
        "paper_award": null,
        "license": []
    },
    {
        "id": 23,
        "title": "Vistaar",
        "area": "ASR",
        "published_on": "2023-05-24",
        "conference": "INTERSPEECH",
        "description": "Vistaar is a set of 59 benchmarks and training datasets across various language and domain combinations such as news, education, literature, tourism etc. The training datasets are avaialable for 12 Indian languages amounting to over 10,700 hours of labelled audio data. We also train IndicWhisper models by fine-tuning the Whisper models on the Vistaar train dataset and observe that it has the lowest WER on 39 out of 59 Vistaar benchmarks.",
        "paper_link": "https://arxiv.org/abs/2305.15386",
        "website_link": "https://github.com/AI4Bharat/vistaar",
        "github_link": "https://github.com/AI4Bharat/vistaar",
        "hf_link": null,
        "paper_award": null,
        "license": []
    },
    {
        "id": 33,
        "title": "MahaDhwani",
        "area": "ASR",
        "published_on": "2025-04-06",
        "conference": "ICASSP",
        "description": "MahaDhwani, a corpus comprising 279K hours of raw audio across 22 Indian languages.",
        "paper_link": "https://ieeexplore.ieee.org/document/10888018/",
        "website_link": "https://github.com/AI4Bharat/MahaDhwani",
        "github_link": "https://github.com/AI4Bharat/MahaDhwani",
        "hf_link": null,
        "paper_award": null,
        "license": []
    },
    {
        "id": 40,
        "title": "Nirantar",
        "area": "ASR",
        "published_on": "2025-05-19",
        "conference": "INTERSPEECH",
        "description": "Nirantar is a comprehensive speech dataset containing 3,240 hours of human-transcribed conversational speech across 22 Indian languages and 208 districts. This large-scale collection effort captures extempore and conversational speech from diverse locations throughout India, with 1,780 hours of newly released data.",
        "paper_link": "https://arxiv.org/pdf/2507.00534",
        "website_link": "https://github.com/AI4Bharat/Nirantar",
        "github_link": "https://github.com/AI4Bharat/Nirantar",
        "hf_link": "https://github.com/AI4Bharat/Nirantar",
        "paper_award": null,
        "license": []
    },
    {
        "id": 43,
        "title": "SRUTI",
        "area": "ASR",
        "published_on": "2025-05-19",
        "conference": "INTERSPEECH",
        "description": "Sruti is a benchmark dataset created by AI4Bharat for evaluating Automatic Speech Recognition (ASR) systems specifically for rural Bhojpuri women.",
        "paper_link": "https://arxiv.org/abs/2506.09653",
        "website_link": "https://huggingface.co/datasets/ai4bharat/Rural_Women_Bhojpuri",
        "github_link": "https://github.com/AI4Bharat/Sruti?tab=readme-ov-file",
        "hf_link": "https://huggingface.co/datasets/ai4bharat/Rural_Women_Bhojpuri",
        "paper_award": null,
        "license": []
    },
    {
        "id": 1,
        "title": "Sangraha",
        "area": "LLM",
        "published_on": "2024-03-11",
        "conference": "ACL",
        "description": "Sangraha is the largest high-quality, cleaned Indic language pretraining data containing 251B tokens summed up over 22 languages, extracted from curated sources, existing multilingual corpora and large scale translations.",
        "paper_link": "https://aclanthology.org/2024.acl-long.843.pdf",
        "website_link": "https://huggingface.co/datasets/ai4bharat/sangraha/tree/main",
        "github_link": "https://github.com/AI4Bharat/IndicLLMSuite",
        "hf_link": "https://huggingface.co/datasets/ai4bharat/sangraha/tree/main",
        "paper_award": "๐Ÿ† ACL 2024 Outstanding Paper Award ๐Ÿ†",
        "license": [
            {
                "id": 1,
                "license_name": "cc-by-4.0",
                "liense_url": "https://choosealicense.com/licenses/cc-by-4.0/"
            }
        ]
    },
    {
        "id": 5,
        "title": "Naamapadam",
        "area": "LLM",
        "published_on": "2022-12-04",
        "conference": "ACL",
        "description": "We present, Naamapadam, the largest publicly available Named Entity Recognition (NER) dataset for the 11 major Indian languages from two language families. The dataset contains more than 400k sentences annotated with a total of at least 100k entities from three standard entity categories (Person, Location, and, Organization) for 9 out of the 11 languages. The training dataset has been automatically created from the Samanantar parallel corpus by projecting automatically tagged entities from an English sentence to the corresponding Indian language translation. We also create manually annotated testsets for 9 languages. We demonstrate the utility of the obtained dataset on the Naamapadam-test dataset. We also release IndicNER, a multilingual IndicBERT model fine-tuned on Naamapadam training set. IndicNER achieves an F1 score of more than 80 for 7 out of 9 test languages.",
        "paper_link": "https://arxiv.org/abs/2212.10168",
        "website_link": "https://huggingface.co/datasets/ai4bharat/naamapadam",
        "github_link": null,
        "hf_link": "https://huggingface.co/datasets/ai4bharat/naamapadam",
        "paper_award": null,
        "license": []
    },
    {
        "id": 6,
        "title": "IndicGLUE",
        "area": "LLM",
        "published_on": "2020-11-20",
        "conference": "EMNLP",
        "description": "To thoroughly evaluate language models on Indian languages, we need a robust NLU benchmark consisting of a wide variety of tasks and covering all the Indian languages. IndicGLUE is a natural language understanding benchmark that we propose. It consists of 6 tasks which we describe in the next section.\r\n\r\nIn addition, we also compile a list of additional evaluations which comprises of tasks based on publicly-available datasets.",
        "paper_link": "https://aclanthology.org/2020.findings-emnlp.445.pdf",
        "website_link": "https://huggingface.co/spaces/evaluate-metric/indic_glue",
        "github_link": "https://github.com/AI4Bharat/Indic-BERT-v1",
        "hf_link": "https://huggingface.co/datasets/ai4bharat/indic_glue",
        "paper_award": null,
        "license": []
    },
    {
        "id": 7,
        "title": "IndicNLG",
        "area": "LLM",
        "published_on": "2022-03-22",
        "conference": "EMNLP",
        "description": "Natural Language Generation (NLG) for non-English languages is hampered by the scarcity of datasets in these languages. In this paper, we present the IndicNLG Benchmark, a collection of datasets for benchmarking NLG for 11 Indic languages. We focus on five diverse tasks, namely, biography generation using Wikipedia infoboxes, news headline generation, sentence summarization, paraphrase generation and, question generation. We describe the created datasets and use them to benchmark the performance of several monolingual and multilingual baselines that leverage pre-trained sequence-to-sequence models. Our results exhibit the strong performance of multilingual language-specific pre-trained models, and the utility of models trained on our dataset for other related NLG tasks. Our dataset creation methods can be easily applied to modest-resource languages as they involve simple steps such as scraping news articles and Wikipedia infoboxes, light cleaning, and pivoting through machine translation data. To the best of our knowledge, the IndicNLG Benchmark is the first NLG benchmark for Indic languages and the most diverse multilingual NLG dataset, with approximately 8M examples across 5 tasks and 11 languages.",
        "paper_link": "https://arxiv.org/abs/2203.05437",
        "website_link": "https://huggingface.co/collections/ai4bharat/indicnlg-66c5a1397bab135be074cfe1",
        "github_link": null,
        "hf_link": "https://huggingface.co/collections/ai4bharat/indicnlg-66c5a1397bab135be074cfe1",
        "paper_award": null,
        "license": []
    },
    {
        "id": 8,
        "title": "IndicXTREME",
        "area": "LLM",
        "published_on": "2022-12-11",
        "conference": "ACL",
        "description": "Building Natural Language Understanding (NLU) capabilities for Indic languages, which have a collective speaker base of more than one billion speakers is absolutely crucial. In this work, we aim to improve the NLU capabilities of Indic languages by making contributions along 3 important axes (i) monolingual corpora (ii) NLU testsets (iii) multilingual LLMs focusing on Indic languages. Specifically, we curate the largest monolingual corpora, IndicCorp, with 20.9B tokens covering 24 languages from 4 language families - a 2.3x increase over prior work, while supporting 12 additional languages. Next, we create a human-supervised benchmark, IndicXTREME, consisting of nine diverse NLU tasks covering 20 languages. Across languages and tasks, IndicXTREME contains a total of 105 evaluation sets, of which 52 are new contributions to the literature. To the best of our knowledge, this is the first effort towards creating a standard benchmark for Indic languages that aims to test the multilingual zero-shot capabilities of pretrained language models. Finally, we train IndicBERT v2, a state-of-the-art model supporting all the languages. Averaged across languages and tasks, the model achieves an absolute improvement of 2 points over a strong baseline.",
        "paper_link": "https://arxiv.org/abs/2212.05409",
        "website_link": "https://huggingface.co/collections/ai4bharat/indicxtreme",
        "github_link": null,
        "hf_link": "https://huggingface.co/collections/ai4bharat/indicxtreme",
        "paper_award": null,
        "license": []
    },
    {
        "id": 10,
        "title": "IndicAlign",
        "area": "LLM",
        "published_on": "2024-03-11",
        "conference": "ACL",
        "description": "IndicAlign Instruct is a diverse collection of 74.7M prompt-response pairs across 20 languages, gathered through four methods: aggregating existing IFT datasets, translating English datasets into 14 Indian languages, generating conversations from India-centric Wikipedia articles using open-source LLMs, and crowdsourcing prompts via the Anudesh platform.",
        "paper_link": "https://aclanthology.org/2024.acl-long.843.pdf",
        "website_link": "https://huggingface.co/datasets/ai4bharat/indic-align",
        "github_link": "https://github.com/AI4Bharat/IndicLLMSuite",
        "hf_link": "https://huggingface.co/datasets/ai4bharat/indic-align",
        "paper_award": "๐Ÿ† ACL 2024 Outstanding Paper Award ๐Ÿ†",
        "license": []
    },
    {
        "id": 11,
        "title": "FBI",
        "area": "LLM",
        "published_on": "2024-06-19",
        "conference": "EMNLP",
        "description": "FBI is a novel framework to evaluate the effectiveness of LLMs as evaluators for text generation tasks Our study reveals significant shortcomings in current Evaluator LLMs, as they fail to detect quality drops in over 50% of cases, raising concerns about their reliability in practical applications.",
        "paper_link": "https://arxiv.org/abs/2406.13439",
        "website_link": "https://huggingface.co/datasets/ai4bharat/FBI",
        "github_link": "https://github.com/AI4Bharat/FBI",
        "hf_link": "https://huggingface.co/datasets/ai4bharat/FBI",
        "paper_award": null,
        "license": []
    },
    {
        "id": 12,
        "title": "IndicCorpV2",
        "area": "LLM",
        "published_on": "2022-12-11",
        "conference": "ACL",
        "description": "IndicCorp v2 is the largest monolingual corpora for Indian languages (at the time of release), with 20.9B tokens covering 24 languages from 4 language families",
        "paper_link": "https://arxiv.org/abs/2212.05409",
        "website_link": "https://github.com/AI4Bharat/IndicBERT/tree/main?tab=readme-ov-file#indiccorp-v2",
        "github_link": "https://github.com/AI4Bharat/IndicBERT/tree/main?tab=readme-ov-file#indiccorp-v2",
        "hf_link": "https://huggingface.co/datasets/ai4bharat/IndicCorpV2",
        "paper_award": null,
        "license": []
    },
    {
        "id": 13,
        "title": "IndicCorp",
        "area": "LLM",
        "published_on": "2020-11-20",
        "conference": "ACL",
        "description": "In this paper, we introduce NLP resources for 11 major Indian languages from two major language families. These resources include: (a) large-scale sentence-level monolingual corpora, (b) pre-trained word embeddings, (c) pre-trained language models, and (d) multiple NLU evaluation datasets (IndicGLUE benchmark). The monolingual corpora contains a total of 8.8 billion tokens across all 11 languages and Indian English, primarily sourced from news crawls. The word embeddings are based on FastText, hence suitable for handling morphological complexity of Indian languages. The pre-trained language models are based on the compact ALBERT model. Lastly, we compile the (IndicGLUE benchmark for Indian language NLU. To this end, we create datasets for the following tasks: Article Genre Classification, Headline Prediction, Wikipedia Section-Title Prediction, Cloze-style Multiple choice QA, Winograd NLI and COPA. We also include publicly available datasets for some Indic languages for tasks like Named Entity Recognition, Cross-lingual Sentence Retrieval, Paraphrase detection, etc. Our embeddings are competitive or better than existing pre-trained embeddings on multiple tasks. We hope that the availability of the dataset will accelerate Indic NLP research which has the potential to impact more than a billion people. It can also help the community in evaluating advances in NLP over a more diverse pool of languages.",
        "paper_link": "https://aclanthology.org/2020.findings-emnlp.445/",
        "website_link": "https://github.com/AI4Bharat/Indic-BERT-v1",
        "github_link": "https://github.com/AI4Bharat/Indic-BERT-v1",
        "hf_link": null,
        "paper_award": null,
        "license": []
    },
    {
        "id": 26,
        "title": "IndicInstruct",
        "area": "LLM",
        "published_on": "2024-01-18",
        "conference": "ARXIV",
        "description": "A collection of different instruction datasets spanning English and Hindi languages. The collection consists of:\r\n\r\nAnudesh, \r\nwikiHow, \r\nFlan v2 (67k sample subset), \r\nDolly, \r\nAnthropic-HHH (5k sample subset), \r\nOpenAssistant v1, \r\nLymSys-Chat (50k sample subset).\r\n\r\nWe translate the English subset of specific datasets using IndicTrans2 (Gala et al., 2023).",
        "paper_link": "https://arxiv.org/abs/2401.15006",
        "website_link": "https://huggingface.co/datasets/ai4bharat/indic-instruct-data-v0.1",
        "github_link": "https://github.com/AI4Bharat/IndicInstruct",
        "hf_link": "https://huggingface.co/datasets/ai4bharat/indic-instruct-data-v0.1",
        "paper_award": null,
        "license": []
    },
    {
        "id": 30,
        "title": "Recon",
        "area": "LLM",
        "published_on": "2024-10-17",
        "conference": "ACL",
        "description": "We introduce RECON, a human-annotated, generalpurpose multilingual evaluation benchmark. The\r\ninput prompts are fully human-generated with multiple levels of supervision. This benchmark serves two key purposes: \r\n\r\n(i) to assess the multilingual capabilities of LLMs and (ii) to meta-evaluate the performance of Evaluator LLMs.",
        "paper_link": "https://arxiv.org/abs/2410.13394",
        "website_link": "https://huggingface.co/collections/ai4bharat/cia-suite-66ea9a7e18a6c70bd8de27a1",
        "github_link": "https://github.com/AI4Bharat/CIA",
        "hf_link": "https://huggingface.co/collections/ai4bharat/cia-suite-66ea9a7e18a6c70bd8de27a1",
        "paper_award": null,
        "license": []
    },
    {
        "id": 31,
        "title": "MILU",
        "area": "LLM",
        "published_on": "2025-01-27",
        "conference": "NAACL",
        "description": "MILU (Multi-task Indic Language Understanding Benchmark) is a comprehensive MCQ based evaluation dataset designed to assess the performance of LLMs across 11 Indic languages and 8 diverse domains.",
        "paper_link": "https://aclanthology.org/2025.naacl-long.507/",
        "website_link": "https://huggingface.co/datasets/ai4bharat/MILU",
        "github_link": "https://github.com/AI4Bharat/MILU",
        "hf_link": "https://huggingface.co/datasets/ai4bharat/MILU",
        "paper_award": null,
        "license": []
    },
    {
        "id": 39,
        "title": "Indic-Bias",
        "area": "LLM",
        "published_on": "2025-05-16",
        "conference": "ACL",
        "description": "Indic-Bias is a comprehensive benchmark to evaluate the fairness of LLMs across 85 Indian Identity groups, focusing on Bias and Stereotypes. We create three tasks - Plausibility, Judgment, and Generation, and evaluate 14 popular LLMs to identify allocative and representational harms.",
        "paper_link": "https://arxiv.org/abs/2506.23111",
        "website_link": "https://huggingface.co/datasets/ai4bharat/Indic-Bias",
        "github_link": "https://github.com/AI4Bharat/indic-bias",
        "hf_link": "https://huggingface.co/datasets/ai4bharat/Indic-Bias",
        "paper_award": null,
        "license": []
    },
    {
        "id": 35,
        "title": "Varta",
        "area": "NLG",
        "published_on": "2023-07-09",
        "conference": "ACL",
        "description": "A large-scale multilingual dataset for headline generation in Indic languages. This dataset includes 41.8 million news articles in 14 different Indic languages (and English), which come from a variety of high-quality sources.",
        "paper_link": "https://aclanthology.org/2023.findings-acl.215.pdf",
        "website_link": null,
        "github_link": "https://github.com/rahular/varta",
        "hf_link": "https://huggingface.co/datasets/rahular/varta",
        "paper_award": null,
        "license": []
    },
    {
        "id": 2,
        "title": "BPCC",
        "area": "NMT",
        "published_on": "2023-05-23",
        "conference": "TMLR",
        "description": "Bharat Parallel Corpus Collection (BPCC) is a comprehensive and publicly available parallel corpus that includes both existing and new data for all 22 scheduled Indic languages. It is comprised of two parts: BPCC-Mined and BPCC-Human, totaling approximately 230 million bitext pairs. BPCC-Mined contains about 228 million pairs, with nearly 126 million pairs newly added as a part of this work. On the other hand, BPCC-Human consists of 2.2 million gold standard English-Indic pairs, with an additional 644K bitext pairs from English Wikipedia sentences (forming the BPCC-H-Wiki subset) and 139K sentences covering everyday use cases (forming the BPCC-H-Daily subset). It is worth highlighting that BPCC provides the first available datasets for 7 languages and significantly increases the available data for all languages covered.",
        "paper_link": "https://arxiv.org/abs/2305.16307",
        "website_link": "https://huggingface.co/datasets/ai4bharat/BPCC/",
        "github_link": "https://github.com/AI4Bharat/IndicTrans2",
        "hf_link": "https://huggingface.co/datasets/ai4bharat/Rasa",
        "paper_award": null,
        "license": []
    },
    {
        "id": 4,
        "title": "Samanantar",
        "area": "NMT",
        "published_on": "2021-04-12",
        "conference": "TACL",
        "description": "We present Samanantar, the largest publicly available parallel corpora collection for Indic languages. The collection contains a total of 49.7 million sentence pairs between English and 11 Indic languages (from two language families). Specifically, we compile 12.4 million sentence pairs from existing, publicly-available parallel corpora, and additionally mine 37.4 million sentence pairs from the web, resulting in a 4x increase. We mine the parallel sentences from the web by combining many corpora, tools, and methods: (a) web-crawled monolingual corpora, (b) document OCR for extracting sentences from scanned documents, (c) multilingual representation models for aligning sentences, and (d) approximate nearest neighbor search for searching in a large collection of sentences. Human evaluation of samples from the newly mined corpora validate the high quality of the parallel sentences across 11 languages. Further, we extract 83.4 million sentence pairs between all 55 Indic language pairs from the English-centric parallel corpus using English as the pivot language. We trained multilingual NMT models spanning all these languages on Samanantar, which outperform existing models and baselines on publicly available benchmarks, such as FLORES, establishing the utility of Samanantar. Our data and models are available publicly at this https URL and we hope they will help advance research in NMT and multilingual NLP for Indic languages.",
        "paper_link": "https://arxiv.org/abs/2104.05596",
        "website_link": "https://huggingface.co/datasets/ai4bharat/samanantar",
        "github_link": "https://github.com/AI4Bharat/indicnlp.ai4bharat.org/blob/master/content/pages/samanantar.md",
        "hf_link": "https://huggingface.co/datasets/ai4bharat/samanantar",
        "paper_award": null,
        "license": []
    },
    {
        "id": 24,
        "title": "IndicMT-Eval",
        "area": "NMT",
        "published_on": "2023-07-03",
        "conference": "ACL",
        "description": "IndicMT-Eval is a comprehensive MQM dataset featuring 7,000 detailed annotations across five Indian languages and seven MT systems, designed for evaluating machine translation metrics in Indian languages.",
        "paper_link": "https://arxiv.org/abs/2212.10180",
        "website_link": "https://huggingface.co/datasets/ai4bharat/IndicMTEval",
        "github_link": "https://github.com/AI4Bharat/IndicMT-Eval",
        "hf_link": "https://huggingface.co/datasets/ai4bharat/IndicMTEval",
        "paper_award": null,
        "license": []
    },
    {
        "id": 34,
        "title": "MQM Dataset",
        "area": "NMT",
        "published_on": "2024-08-11",
        "conference": "ACL",
        "description": "MQM dataset for 4 low resource languages : Assamese, Maithili, Punjabi, and Kannada",
        "paper_link": "https://aclanthology.org/2024.acl-short.58/",
        "website_link": "https://huggingface.co/datasets/ai4bharat/IndicMTEval",
        "github_link": "https://github.com/AI4Bharat/IndicMT-Eval",
        "hf_link": "https://huggingface.co/datasets/ai4bharat/IndicMTEval",
        "paper_award": null,
        "license": []
    },
    {
        "id": 46,
        "title": "Pralekha",
        "area": "NMT",
        "published_on": "2025-10-25",
        "conference": "AACL",
        "description": "Pralekha is a large-scale parallel document dataset spanning 11 Indic languages and English, comprising 3 million document pairs. It serves both as a benchmark for evaluating Cross-Lingual Document Alignment (CLDA) techniques and as a domain-specific parallel corpus for training document-level Machine Translation (MT) models for Indic languages.",
        "paper_link": "https://arxiv.org/abs/2411.19096",
        "website_link": "https://huggingface.co/datasets/ai4bharat/Pralekha",
        "github_link": "https://github.com/AI4Bharat/Pralekha",
        "hf_link": "https://huggingface.co/datasets/ai4bharat/Pralekha",
        "paper_award": null,
        "license": []
    },
    {
        "id": 42,
        "title": "FERMAT",
        "area": "OCR",
        "published_on": "2025-05-16",
        "conference": "ACL",
        "description": "FERMAT is a comprehensive benchmark designed to evaluate how well Vision-Language Models (VLMs) can assess handwritten mathematical content from students. The benchmark contains over 2,200 handwritten math solutions from 609 problems covering grades 7-12, with intentionally introduced errors across four key categories.",
        "paper_link": "https://arxiv.org/abs/2501.07244",
        "website_link": null,
        "github_link": "https://github.com/AI4Bharat/FERMAT",
        "hf_link": "https://huggingface.co/datasets/ai4bharat/FERMAT",
        "paper_award": null,
        "license": []
    },
    {
        "id": 45,
        "title": "IndicDLP",
        "area": "OCR",
        "published_on": "2025-09-16",
        "conference": "ICDAR",
        "description": "A large-scale foundational document layout dataset spanning 11 representative Indic languages alongside English and 12 common document domains. The dataset contains 119,806 manually annotated images with 42 physical and logical layout region classes, addressing gaps in scale, multilingual diversity, and annotation granularity for document layout parsing.โ€‹",
        "paper_link": "https://link.springer.com/chapter/10.1007/978-3-032-04614-7_2#citeas",
        "website_link": "https://indicdlp.github.io/",
        "github_link": "https://github.com/AI4Bharat/IndicDLP",
        "hf_link": "https://huggingface.co/IndicDLP/IndicDLP-model",
        "paper_award": "Best Student Paper Runner-up",
        "license": []
    },
    {
        "id": 36,
        "title": "SignCorpus",
        "area": "SLR",
        "published_on": "2022-05-11",
        "conference": "NeurIPS",
        "description": "A large pretraining dataset on sign languages comprising about 4.6K\r\nhours of signing data across 10 sign languages. SignCorpus is curated from\r\nsign language videos on the internet, filtered for data quality, and converted into\r\nsequences of pose keypoints thereby removing all personal identifiable information\r\n(PII)",
        "paper_link": "https://papers.nips.cc/paper_files/paper/2022/file/eb011fd258c763c44d8c6a0e9ce04f17-Paper-Datasets_and_Benchmarks.pdf",
        "website_link": "https://openhands.ai4bharat.org/en/latest/",
        "github_link": null,
        "hf_link": null,
        "paper_award": null,
        "license": []
    },
    {
        "id": 38,
        "title": "INCLUDE",
        "area": "SLR",
        "published_on": "2020-10-12",
        "conference": "ACM",
        "description": "An ISL dataset that contains 0.27 million frames across 4,287 videos over 263 word signs from 15 different word categories.",
        "paper_link": "https://dl.acm.org/doi/pdf/10.1145/3394171.3413528",
        "website_link": "https://sign-language.ai4bharat.org/",
        "github_link": "https://github.com/AI4Bharat/sign-language.ai4bharat.org/tree/master",
        "hf_link": null,
        "paper_award": null,
        "license": []
    },
    {
        "id": 32,
        "title": "BhasaAnuvaad",
        "area": "ST",
        "published_on": "2024-11-08",
        "conference": "ACL",
        "description": "BhasaAnuvaad, is the largest Indic-language Speech Translation dataset spanning over 44,400 hours of speech and 17M text segments for 13 of 22 scheduled Indian languages and English.",
        "paper_link": "https://arxiv.org/abs/2411.04699",
        "website_link": "https://huggingface.co/collections/ai4bharat/bhasaanuvaad-672b3790b6470eab68b1cb87",
        "github_link": "https://github.com/AI4Bharat/BhasaAnuvaad",
        "hf_link": "https://huggingface.co/collections/ai4bharat/bhasaanuvaad-672b3790b6470eab68b1cb87",
        "paper_award": null,
        "license": []
    },
    {
        "id": 14,
        "title": "Rasa",
        "area": "TTS",
        "published_on": "2024-07-19",
        "conference": "INTERSPEECH",
        "description": "We release Rasa, the first multilingual expressive TTS dataset for any Indian language, which contains 10 hours of neutral speech and 1-3 hours of expressive speech for each of the 6 Ekman emotions covering 3 languages: Assamese, Bengali, & Tamil. Our ablation studies reveal that just 1 hour of neutral and 30 minutes of expressive data can yield a Fair system as indicated by MUSHRA scores. Increasing neutral data to 10 hours, with minimal expressive data, significantly enhances expressiveness. This offers a practical recipe for resource-constrained languages, prioritizing easily obtainable neutral data alongside smaller amounts of expressive data. We show the importance of syllabically balanced data and pooling emotions to enhance expressiveness. We also highlight challenges in generating specific emotions, e.g., fear and surprise.",
        "paper_link": "https://arxiv.org/pdf/2407.14056",
        "website_link": "https://huggingface.co/datasets/ai4bharat/Rasa",
        "github_link": "https://github.com/AI4Bharat/Rasa",
        "hf_link": "https://huggingface.co/datasets/ai4bharat/Rasa",
        "paper_award": null,
        "license": []
    },
    {
        "id": 28,
        "title": "IndicOOV",
        "area": "TTS",
        "published_on": "2024-07-21",
        "conference": "INTERSPEECH",
        "description": "Publicly available TTS datasets for low-resource languages like Hindi and Tamil typically contain 10-20 hours of data, leading to poor vocabulary coverage. This limitation becomes evident in downstream applications where domain-specific vocabulary coupled with frequent code-mixing with English, results in many OOV words. To highlight this problem, we create a benchmark containing OOV words from several real-world applications. Indeed, state-of-the-art Hindi and Tamil TTS systems perform poorly on this OOV benchmark, as indicated by intelligibility tests. To improve the modelโ€™s OOV performance, we propose a low-effort and economically viable strategy to obtain more training data. Specifically, we propose using volunteers as opposed to high quality voice artists to record words containing character bigrams unseen in the training data. We show that using such inexpensive data, the modelโ€™s performance improves on OOV words, while not affecting voice quality and in-domain performance.",
        "paper_link": "https://www.isca-archive.org/interspeech_2024/anand24_interspeech.pdf",
        "website_link": "https://github.com/AI4Bharat/IndicOOV",
        "github_link": "https://github.com/AI4Bharat/IndicOOV",
        "hf_link": null,
        "paper_award": null,
        "license": []
    },
    {
        "id": 29,
        "title": "IndicVoices-R",
        "area": "TTS",
        "published_on": "2024-09-09",
        "conference": "NeurIPS",
        "description": "Recent advancements in text-to-speech (TTS) synthesis show that large-scale models trained with extensive web data produce highly natural-sounding output. However, such data is scarce for Indian languages due to the lack of high-quality, manually subtitled data on platforms like LibriVox or YouTube. To address this gap, we enhance existing large-scale ASR datasets containing natural conversations collected in low-quality environments to generate high-quality TTS training data. Our pipeline leverages the cross-lingual generalization of denoising and speech enhancement models trained on English and applied to Indian languages. This results in IndicVoices-R (IV-R), the largest multilingual Indian TTS dataset derived from an ASR dataset, with 1,704 hours of high-quality speech from 10,496 speakers across 22 Indian languages. IV-R matches the quality of gold-standard TTS datasets like LJSpeech, LibriTTS, and IndicTTS. We also introduce the IV-R Benchmark, the first to assess zero-shot, few-shot, and many-shot speaker generalization capabilities of TTS models on Indian voices, ensuring diversity in age, gender, and style. We demonstrate that fine-tuning an English pre-trained model on a combined dataset of high-quality IndicTTS and our IV-R dataset results in better zero-shot speaker generalization compared to fine-tuning on the IndicTTS dataset alone. Further, our evaluation reveals limited zero-shot generalization for Indian voices in TTS models trained on prior datasets, which we improve by fine-tuning the model on our data containing diverse set of speakers across language families. We open-source all data for all 22 official Indian languages.",
        "paper_link": "https://proceedings.neurips.cc/paper_files/paper/2024/file/7dfcaf4512bbf2a807a783b90afb6c09-Paper-Datasets_and_Benchmarks_Track.pdf",
        "website_link": "https://huggingface.co/datasets/ai4bharat/indicvoices_r",
        "github_link": "https://github.com/AI4Bharat/IndicVoices-R",
        "hf_link": "https://huggingface.co/datasets/ai4bharat/indicvoices_r",
        "paper_award": null,
        "license": []
    },
    {
        "id": 44,
        "title": "MANGO",
        "area": "TTS",
        "published_on": "2025-05-19",
        "conference": "TMLR",
        "description": "MANGO is a massive dataset containing 246,000 human ratings for text-to-speech evaluation, representing the first large-scale collection for Indian languages (Hindi and Tamil). Created from 492 human listeners, the dataset serves two key purposes: analyzing human preferences in TTS systems and developing automatic evaluation metrics that align with human judgment. Released alongside research addressing MUSHRA test limitations, MANGO provides crucial empirical data for advancing TTS evaluation methodologies and fills a significant gap in speech synthesis research for Indian languages.",
        "paper_link": "https://arxiv.org/abs/2411.12719",
        "website_link": "https://huggingface.co/datasets/ai4bharat/MANGO",
        "github_link": null,
        "hf_link": "https://huggingface.co/datasets/ai4bharat/MANGO",
        "paper_award": null,
        "license": []
    },
    {
        "id": 47,
        "title": "Saffron",
        "area": "TTS",
        "published_on": "2025-08-19",
        "conference": "INTERSPEECH",
        "description": "While subjective evaluations in recent years indicate rapid progress in TTS, can current TTS systems truly pass a human deception test in a Turing-like evaluation? We introduce Human Fooling Rate (HFR), a metric that directly measures how often machine-generated speech is mistaken for human. Our large-scale evaluation of open-source and commercial TTS models reveals critical insights: (i) CMOS-based claims of human parity often fail under deception testing, (ii) TTS progress should be benchmarked on datasets where human speech achieves high HFRs, as evaluating against monotonous or less expressive reference samples sets a low bar, (iii) Commercial models approach human deception in zero-shot settings, while open-source systems still struggle with natural conversational speech; (iv) Fine-tuning on high-quality data improves realism but does not fully bridge the gap. Our findings underscore the need for more realistic, human-centric evaluations alongside existing subjective tests.",
        "paper_link": "https://arxiv.org/pdf/2508.04179",
        "website_link": "https://github.com/AI4Bharat/saffron",
        "github_link": "https://github.com/AI4Bharat/saffron",
        "hf_link": null,
        "paper_award": null,
        "license": []
    },
    {
        "id": 3,
        "title": "Aksharantar",
        "area": "XLIT",
        "published_on": "2022-05-06",
        "conference": "EMNLP",
        "description": "Transliteration is very important in the Indian language context due to the usage of multiple scripts and the widespread use of romanized inputs. However, few training and evaluation sets are publicly available. We introduce Aksharantar, the largest publicly available transliteration dataset for Indian languages created by mining from monolingual and parallel corpora, as well as collecting data from human annotators. The dataset contains 26 million transliteration pairs for 21 Indic languages from 3 language families using 12 scripts. Aksharantar is 21 times larger than existing datasets and is the first publicly available dataset for 7 languages and 1 language family. We also introduce the Aksharantar testset comprising 103k word pairs spanning 19 languages that enables a fine-grained analysis of transliteration models on native origin words, foreign words, frequent words, and rare words. Using the training set, we trained IndicXlit, a multilingual transliteration model that improves accuracy by 15% on the Dakshina test set, and establishes strong baselines on the Aksharantar testset introduced in this work. The models, mining scripts, transliteration guidelines, and datasets are available at this https URL under open-source licenses. We hope the availability of these large-scale, open resources will spur innovation for Indic language transliteration and downstream applications. We hope the availability of these large-scale, open resources will spur innovation for Indic language transliteration and downstream applications.",
        "paper_link": "https://arxiv.org/abs/2205.03018",
        "website_link": "https://huggingface.co/datasets/ai4bharat/Aksharantar",
        "github_link": "https://github.com/AI4Bharat/IndicXlit",
        "hf_link": "https://huggingface.co/datasets/ai4bharat/Aksharantar",
        "paper_award": null,
        "license": []
    },
    {
        "id": 9,
        "title": "Bhasha-Abhijnaanam",
        "area": "XLIT",
        "published_on": "2023-07-09",
        "conference": "ACL",
        "description": "We create publicly available language identification (LID) datasets and models in all 22 Indian languages listed in the Indian constitution in both native-script and romanized text. First, we create Bhasha-Abhijnaanam, a language identification test set for native-script as well as romanized text which spans all 22 Indic languages. We also train IndicLID, a language identifier for all the above-mentioned languages in both native and romanized script. For native-script text, it has better language coverage than existing LIDs and is competitive or better than other LIDs. IndicLID is the first LID for romanized text in Indian languages. Two major challenges for romanized text LID are the lack of training data and low-LID performance when languages are similar. We provide simple and effective solutions to these problems. In general, there has been limited work on romanized text in any language, and our findings are relevant to other languages that need romanized language identification.",
        "paper_link": "https://arxiv.org/abs/2305.15814",
        "website_link": "https://huggingface.co/datasets/ai4bharat/Bhasha-Abhijnaanam",
        "github_link": "https://github.com/AI4Bharat/IndicLID",
        "hf_link": "https://huggingface.co/datasets/ai4bharat/Bhasha-Abhijnaanam",
        "paper_award": null,
        "license": []
    }
]
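
The body above is plain JSON, so the endpoint can be consumed directly from a script. Below is a minimal client sketch, assuming Python with the requests library; the BASE_URL placeholder, the fetch_datasets helper, and the area="ASR" filter are illustrative assumptions (only the relative path /datasets/ and record fields such as "title", "area", "conference", and "hf_link" come from the response shown).

import requests

# Hypothetical host; only the relative path "/datasets/" appears in the response above.
BASE_URL = "https://example.org"

def fetch_datasets(area=None):
    """Fetch the /datasets/ listing and optionally filter by the 'area' field
    (e.g. 'ASR', 'LLM', 'NMT', 'TTS') seen in the records above."""
    resp = requests.get(
        f"{BASE_URL}/datasets/",
        headers={"Accept": "application/json"},  # matches the Content-Type shown above
        timeout=30,
    )
    resp.raise_for_status()
    records = resp.json()  # a JSON array of dataset objects
    if area is not None:
        records = [r for r in records if r.get("area") == area]
    return records

if __name__ == "__main__":
    for ds in fetch_datasets(area="ASR"):
        # Each record carries a title, conference, and one or more links.
        print(ds["title"], "-", ds["conference"], "-", ds.get("hf_link") or ds.get("github_link") or "")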