HTTP 200 OK
Allow: GET, HEAD, OPTIONS
Content-Type: application/json
Vary: Accept
[
{
"id": 1,
"title": "IndicTrans2",
"area": "NMT",
"published_on": "2023-05-23",
"conference": "TMLR",
"description": "IndicTrans2 is the first open-source transformer-based multilingual NMT model that supports high-quality translations across all the 22 scheduled Indic languages — including multiple scripts for low-resource languages like Kashmiri, Manipuri and Sindhi. It adopts script unification wherever feasible to leverage transfer learning by lexical sharing between languages. Overall, the model supports five scripts: Perso-Arabic (Kashmiri, Sindhi, Urdu), Ol Chiki (Santali), Meitei (Manipuri), Latin (English), and Devanagari (used for all the remaining languages).",
"paper_link": "https://arxiv.org/abs/2305.16307",
"colab_link": "https://colab.research.google.com/github/AI4Bharat/IndicTrans2/blob/main/huggingface_interface/colab_inference.ipynb",
"website_link": "https://github.com/AI4Bharat/IndicTrans2",
"github_link": "https://github.com/AI4Bharat/IndicTrans2",
"service_id": "ai4bharat/indictrans--gpu-t4",
"hf_link": "https://huggingface.co/collections/ai4bharat/indictrans2-664ccb91d23bbae0d681c3ca",
"installation_steps_json": [
{
"instruction": "Setup and Installation",
"codeString": null,
"type": "heading"
},
{
"instruction": "1. Clone the IndicTransToolkit GitHub repository and navigate to the project directory",
"codeString": "git clone https://github.com/VarunGumma/IndicTransToolkit.git\ncd IndicTransToolkit",
"type": "instruction"
},
{
"instruction": "2. Create a Virtual Environment (either conda or venv) with Python>=3.8 and activate it. We recommend using a Virtual Environment to avoid dependency issues.",
"codeString": null,
"type": "instruction"
},
{
"instruction": "3. Install the IndicTransToolkit.",
"codeString": "pip install --editable ./",
"type": "instruction"
},
{
"instruction": "Additional Installation Notes",
"codeString": null,
"type": "heading"
},
{
"instruction": "We recommend you have access to a GPU for faster translation, or the code will fall back to the CPU.",
"codeString": null,
"type": "instruction"
}
],
"usage_steps_json": [
{
"instruction": "General Usage",
"codeString": null,
"type": "heading"
},
{
"instruction": "To run inference using the provided Python interface, please execute the following block of code:",
"codeString": null,
"type": "instruction"
},
{
"instruction": "Python Inference",
"codeString": null,
"type": "heading"
},
{
"instruction": "",
"codeString": "import torch\nfrom transformers import (\n AutoModelForSeq2SeqLM,\n AutoTokenizer,\n)\nfrom IndicTransToolkit import IndicProcessor\n\nmodel_name = \"ai4bharat/indictrans2-indic-en-1B\"\ntokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)\nmodel = AutoModelForSeq2SeqLM.from_pretrained(model_name, trust_remote_code=True)\nip = IndicProcessor(inference=True)\n\ninput_sentences = [\n \"जब मैं छोटा था, मैं हर रोज़ पार्क जाता था।\",\n \"हमने पिछले सप्ताह एक नई फिल्म देखी जो कि बहुत प्रेरणादायक थी।\",\n \"अगर तुम मुझे उस समय पास मिलते, तो हम बाहर खाना खाने चलते।\",\n \"मेरे मित्र ने मुझे उसके जन्मदिन की पार्टी में बुलाया है, और मैं उसे एक तोहफा दूंगा।\",\n]\n\nsrc_lang, tgt_lang = \"hin_Deva\", \"eng_Latn\"\n\nbatch = ip.preprocess_batch(\n input_sentences,\n src_lang=src_lang,\n tgt_lang=tgt_lang,\n)\n\nDEVICE = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n\n# Tokenize the sentences and generate input encodings\ninputs = tokenizer(\n batch,\n truncation=True,\n padding=\"longest\",\n return_tensors=\"pt\",\n return_attention_mask=True,\n).to(DEVICE)\n\n# Generate translations using the model\nwith torch.no_grad():\n generated_tokens = model.generate(\n **inputs,\n use_cache=True,\n min_length=0,\n max_length=256,\n num_beams=5,\n num_return_sequences=1,\n )\n\n# Decode the generated tokens into text\nwith tokenizer.as_target_tokenizer():\n generated_tokens = tokenizer.batch_decode(\n generated_tokens.detach().cpu().tolist(),\n skip_special_tokens=True,\n clean_up_tokenization_spaces=True,\n )\n\n# Postprocess the translations, including entity replacement\ntranslations = ip.postprocess_batch(generated_tokens, lang=tgt_lang)\n\nfor input_sentence, translation in zip(input_sentences, translations):\n print(f\"{src_lang}: {input_sentence}\")\n print(f\"{tgt_lang}: {translation}\")",
"type": "code"
}
],
"testimonials_json": null,
"latest": true,
"paper_award": null,
"license": []
},
{
"id": 2,
"title": "IndicWhisper",
"area": "ASR",
"published_on": "2023-05-24",
"conference": "INTERSPEECH",
"description": "Improving ASR systems is necessary to make new LLM-based use-cases accessible to people across the globe. In this paper, we focus on Indian languages, and make the case that diverse benchmarks are required to evaluate and improve ASR systems for Indian languages. To address this, we collate Vistaar as a set of 59 benchmarks across various language and domain combinations, on which we evaluate 3 publicly available ASR systems and 2 commercial systems. We also train IndicWhisper models by fine-tuning the Whisper models on publicly available training datasets across 12 Indian languages totalling to 10.7K hours. We show that IndicWhisper significantly improves on considered ASR systems on the Vistaar benchmark. Indeed, IndicWhisper has the lowest WER in 39 out of the 59 benchmarks, with an average reduction of 4.1 WER. We open-source all datasets, code and models.",
"paper_link": "https://arxiv.org/abs/2305.15386",
"colab_link": null,
"website_link": "https://github.com/AI4Bharat/vistaar",
"github_link": "https://github.com/AI4Bharat/vistaar",
"service_id": "ai4bharat/whisper--gpu-t4",
"hf_link": null,
"installation_steps_json": null,
"usage_steps_json": [
{
"instruction": "Inference using IndicWhisper",
"codeString": null,
"type": "heading"
},
{
"instruction": "Sample structure of manifest file",
"codeString": "{\"audio_filepath\":<path to audio file 1>}\n{\"audio_filepath\":<path to audio file 2>}\n...",
"type": "instruction"
},
{
"instruction": "Running batch inference",
"codeString": "deepspeed --include localhost:<gpus to include> \\\ntranscribe.py <manifest path> \\\n<model path> \\\n<current language> \\\n<batch size> \\\n<output path>",
"type": "instruction"
},
{
"instruction": "Running inference for a single audio file",
"codeString": "from transformers import pipeline\n\nmodel_path = \"hindi_models/whisper-medium-hi_alldata_multigpu\"\ndevice = \"cuda\"\nlang_code = \"hi\"\n\nwhisper_asr = pipeline(\n \"automatic-speech-recognition\", model=model_path, device=device,\n)\n\n# Special case to handle odia since odia is not supported by whisper model\nif lang_code == 'or':\n whisper_asr.model.config.forced_decoder_ids = (\n whisper_asr.tokenizer.get_decoder_prompt_ids(\n language=None, task=\"transcribe\"\n )\n )\nelse:\n whisper_asr.model.config.forced_decoder_ids = (\n whisper_asr.tokenizer.get_decoder_prompt_ids(\n language=lang_code, task=\"transcribe\"\n )\n )\n\nresult = whisper_asr(\"audio.mp3\")\nprint(result[\"text\"])",
"type": "instruction"
}
],
"testimonials_json": null,
"latest": false,
"paper_award": null,
"license": []
},
{
"id": 3,
"title": "IndicXlit",
"area": "XLIT",
"published_on": "2022-05-06",
"conference": "EMNLP",
"description": "Transliteration is very important in the Indian language context due to the usage of multiple scripts and the widespread use of romanized inputs. However, few training and evaluation sets are publicly available. We introduce Aksharantar, the largest publicly available transliteration dataset for Indian languages created by mining from monolingual and parallel corpora, as well as collecting data from human annotators. The dataset contains 26 million transliteration pairs for 21 Indic languages from 3 language families using 12 scripts. Aksharantar is 21 times larger than existing datasets and is the first publicly available dataset for 7 languages and 1 language family. We also introduce the Aksharantar testset comprising 103k word pairs spanning 19 languages that enables a fine-grained analysis of transliteration models on native origin words, foreign words, frequent words, and rare words. Using the training set, we trained IndicXlit, a multilingual transliteration model that improves accuracy by 15% on the Dakshina test set, and establishes strong baselines on the Aksharantar testset introduced in this work. The models, mining scripts, transliteration guidelines, and datasets are available at https://github.com/AI4Bharat/IndicXlit under open-source licenses. We hope the availability of these large-scale, open resources will spur innovation for Indic language transliteration and downstream applications.",
"paper_link": "https://arxiv.org/abs/2205.03018",
"colab_link": "https://colab.research.google.com/drive/1P78Tbr6zhe-5LeiKk525N3SGPKn2ofGg?usp=sharing",
"website_link": "https://github.com/AI4Bharat/IndicXlit",
"github_link": "https://github.com/AI4Bharat/IndicXlit",
"service_id": "ai4bharat/indicxlit--gpu-t4",
"hf_link": null,
"installation_steps_json": [
{
"instruction": "Installing the Python library, which is a wrapper around the IndicXlit model",
"codeString": null,
"type": "heading"
},
{
"instruction": "1. Install the library",
"codeString": "!pip install ai4bharat-transliteration",
"type": "instruction"
},
{
"instruction": "Note: For thorough documentation, visit the official page",
"codeString": "https://pypi.org/project/ai4bharat-transliteration/",
"type": "instruction"
},
{
"instruction": "2. Import the module for transliteration engine",
"codeString": "from ai4bharat.transliteration import XlitEngine",
"type": "instruction"
},
{
"instruction": "Note: The model supports the following languages",
"codeString": "[as, bn, brx, gom, gu, hi, kn, ks, mai, ml, mni, mr, ne, or, pa, sa, sd, si, ta, te, ur]",
"type": "instruction"
},
{
"instruction": "3. Using word transliteration",
"codeString": "# Initializing the en-indic multilingual model and dictionaries (if rescore option is True)\ne = XlitEngine(\"hi\", beam_width=4, rescore=True, src_script_type=\"en\")\n\n# Transliterate word\nout = e.translit_word(\"one\", topk=1)\nprint(out)",
"type": "instruction"
},
{
"instruction": "Note: `beam_width` increases beam search size, improving accuracy but increasing time/compute. (Default: 4)",
"codeString": null,
"type": "instruction"
},
{
"instruction": "Note: `topk` returns only the specified number of top results. (Default: 4)",
"codeString": null,
"type": "instruction"
},
{
"instruction": "Note: `rescore` returns reranked suggestions after using a dictionary. (Default: True)",
"codeString": null,
"type": "instruction"
}
],
"usage_steps_json": null,
"testimonials_json": null,
"latest": true,
"paper_award": null,
"license": []
},
{
"id": 4,
"title": "IndicWav2Vec",
"area": "ASR",
"published_on": "2021-11-06",
"conference": "AAAI",
"description": "Recent methods in speech and language technology pretrain very LARGE models which are fine-tuned for specific tasks. However, the benefits of such LARGE models are often limited to a few resource rich languages of the world. In this work, we make multiple contributions towards building ASR systems for low resource languages from the Indian subcontinent. First, we curate 17,000 hours of raw speech data for 40 Indian languages from a wide variety of domains including education, news, technology, and finance. Second, using this raw speech data we pretrain several variants of wav2vec style models for 40 Indian languages. Third, we analyze the pretrained models to find key features: codebook vectors of similar sounding phonemes are shared across languages, representations across layers are discriminative of the language family, and attention heads often pay attention within small local windows. Fourth, we fine-tune this model for downstream ASR for 9 languages and obtain state-of-the-art results on 3 public datasets, including on very low-resource languages such as Sinhala and Nepali. Our work establishes that multilingual pretraining is an effective strategy for building ASR systems for the linguistically diverse speakers of the Indian subcontinent. Our code, data and models are available publicly at https://github.com/AI4Bharat/IndicWav2Vec and we hope they will help advance research in ASR for Indic languages.",
"paper_link": "https://arxiv.org/abs/2111.03945",
"colab_link": null,
"website_link": "https://github.com/AI4Bharat/IndicWav2Vec",
"github_link": "https://github.com/AI4Bharat/IndicWav2Vec",
"service_id": "ai4bharat/conformer-multilingual-dravidian--gpu-t4,ai4bharat/conformer-multilingual-indo-aryan--gpu-t4,ai4bharat/conformer-hi--gpu-t4",
"hf_link": null,
"installation_steps_json": null,
"usage_steps_json": null,
"testimonials_json": null,
"latest": false,
"paper_award": null,
"license": []
},
{
"id": 5,
"title": "IndicTTS",
"area": "TTS",
"published_on": "2022-11-17",
"conference": "ICASSP",
"description": "Deep learning based text-to-speech (TTS) systems have been evolving rapidly with advances in model architectures, training methodologies, and generalization across speakers and languages. However, these advances have not been thoroughly investigated for Indian language speech synthesis. Such investigation is computationally expensive given the number and diversity of Indian languages, relatively lower resource availability, and the diverse set of advances in neural TTS that remain untested. In this paper, we evaluate the choice of acoustic models, vocoders, supplementary loss functions, training schedules, and speaker and language diversity for Dravidian and Indo-Aryan languages. Based on this, we identify monolingual models with FastPitch and HiFi-GAN V1, trained jointly on male and female speakers to perform the best. With this setup, we train and evaluate TTS models for 13 languages and find our models to significantly improve upon existing models in all languages as measured by mean opinion scores. We open-source all models on the Bhashini platform.",
"paper_link": "https://arxiv.org/pdf/2211.09536",
"colab_link": "https://colab.research.google.com/drive/10XS50NY0TtAnqTKmgIj3iNLjf7A-lGJz?usp=sharing",
"website_link": "https://github.com/AI4Bharat/Indic-TTS",
"github_link": "https://github.com/AI4Bharat/Indic-TTS",
"service_id": "ai4bharat/indic-tts-dravidian--gpu-t4,ai4bharat/indic-tts-misc--gpu-t4,ai4bharat/indic-tts-indo-aryan--gpu-t4",
"hf_link": null,
"installation_steps_json": [
{
"instruction": "1. Create environment",
"codeString": "sudo apt-get install libsndfile1-dev ffmpeg enchant\nconda create -n tts-env\nconda activate tts-env",
"type": "instruction"
},
{
"instruction": "2. Setup PyTorch",
"codeString": "pip3 install -U torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu113",
"type": "instruction"
},
{
"instruction": "3. Setup Trainer",
"codeString": "git clone https://github.com/gokulkarthik/Trainer\n\ncd Trainer\npip3 install -e .[all]\ncd ..",
"type": "instruction"
},
{
"instruction": "Alternative: Copy Trainer files for fixes",
"codeString": "cp Trainer/trainer/logging/wandb_logger.py to the local Trainer installation\ncp Trainer/trainer/trainer.py to the local Trainer installation\nadd `gpus = [str(gpu) for gpu in gpus]` in line 53 of trainer/distribute.py",
"type": "instruction"
},
{
"instruction": "4. Setup TTS",
"codeString": "git clone https://github.com/gokulkarthik/TTS\n\ncd TTS\npip3 install -e .[all]\ncd ..",
"type": "instruction"
},
{
"instruction": "Alternative: Copy TTS files for fixes",
"codeString": "cp TTS/TTS/bin/synthesize.py to the local TTS installation",
"type": "instruction"
},
{
"instruction": "5. Install other requirements",
"codeString": "pip3 install -r requirements.txt",
"type": "instruction"
},
{
"instruction": "Code Reference",
"codeString": "https://github.com/coqui-ai/TTS",
"type": "instruction"
}
],
"usage_steps_json": null,
"testimonials_json": null,
"latest": false,
"paper_award": null,
"license": []
},
{
"id": 6,
"title": "IndicTrans",
"area": "NMT",
"published_on": "2021-04-12",
"conference": "TACL",
"description": "IndicTrans is a Transformer based multilingual NMT model trained on the Samanantar dataset which is the largest publicly available parallel corpora collection for Indic languages at the time of release.",
"paper_link": "https://arxiv.org/abs/2104.05596",
"colab_link": "https://github.com/AI4Bharat/indicTrans/blob/main/indicTrans_python_interface.ipynb",
"website_link": "https://github.com/AI4Bharat/indicTrans",
"github_link": "https://github.com/AI4Bharat/indicTrans",
"service_id": null,
"hf_link": null,
"installation_steps_json": [
{
"instruction": "Setup and Installation",
"codeString": null,
"type": "heading"
},
{
"instruction": "1. Clone the IndicTrans GitHub repository and navigate to the project directory",
"codeString": "git clone https://github.com/AI4Bharat/indicTrans.git\ncd indicTrans",
"type": "instruction"
},
{
"instruction": "2. Create a Virtual Environment (either conda or venv) with Python>=3.8 and activate it. We recommend using a Virtual Environment to avoid dependency issues.",
"codeString": null,
"type": "instruction"
},
{
"instruction": "3. Download required libraries",
"codeString": "git clone https://github.com/anoopkunchukuttan/indic_nlp_library.git\ngit clone https://github.com/anoopkunchukuttan/indic_nlp_resources.git\ngit clone https://github.com/rsennrich/subword-nmt.git",
"type": "instruction"
},
{
"instruction": "4. Install required libraries",
"codeString": "pip install sacremoses pandas mock sacrebleu tensorboardX pyarrow indic-nlp-library",
"type": "instruction"
},
{
"instruction": "5. Install fairseq.",
"codeString": "git clone https://github.com/pytorch/fairseq.git\ncd fairseq\ngit checkout da9eaba12d82b9bfc1442f0e2c6fc1b895f4d35d\npip install --editable ./",
"type": "instruction"
},
{
"instruction": "Additional Installation Notes",
"codeString": null,
"type": "heading"
},
{
"instruction": "We recommend you have access to a GPU for faster translation, or the code will fall back to the CPU.",
"codeString": null,
"type": "instruction"
}
],
"usage_steps_json": null,
"testimonials_json": null,
"latest": false,
"paper_award": null,
"license": []
},
{
"id": 7,
"title": "IndicBERT",
"area": "LLM",
"published_on": "2020-12-01",
"conference": "EMNLP",
"description": "IndicBERT is a multilingual ALBERT model pretrained on an Indic monolingual corpus of around 9 billion tokens, covering 12 major Indian languages. It has been evaluated on a diverse set of tasks from IndicGLUE to assess its performance.",
"paper_link": "https://aclanthology.org/2020.findings-emnlp.445.pdf",
"colab_link": null,
"website_link": "https://github.com/AI4Bharat/Indic-BERT-v1",
"github_link": "https://github.com/AI4Bharat/Indic-BERT-v1",
"service_id": null,
"hf_link": "https://huggingface.co/ai4bharat/indic-bert",
"installation_steps_json": null,
"usage_steps_json": null,
"testimonials_json": null,
"latest": false,
"paper_award": null,
"license": []
},
{
"id": 8,
"title": "IndicBERTv2",
"area": "LLM",
"published_on": "2022-12-11",
"conference": "ACL",
"description": "IndicBERT v2 is a multilingual BERT model pretrained on IndicCorp v2, an Indic monolingual corpus of 20.9 billion tokens, covering 24 constitutionally recognised Indian languages. It has been evaluated on a diverse set of tasks from IndicXTREME to assess its performance.",
"paper_link": "https://arxiv.org/abs/2212.05409",
"colab_link": "https://colab.research.google.com/drive/1nBTATMReFugH-w1glMaJ5Jsi8PhmtrDl?usp=sharing",
"website_link": "https://github.com/AI4Bharat/IndicBERT",
"github_link": "https://github.com/AI4Bharat/IndicBERT",
"service_id": null,
"hf_link": "https://huggingface.co/collections/ai4bharat/indicbert-v2-66c5a0bd4ee34ebc59303bc5",
"installation_steps_json": [
{
"instruction": "Setup and Installation",
"codeString": null,
"type": "heading"
},
{
"instruction": "1. Clone the IndicBERT GitHub repository and navigate to the project directory",
"codeString": "git clone https://github.com/AI4Bharat/IndicBERT.git\ncd IndicBERT",
"type": "instruction"
},
{
"instruction": "2. Create a Virtual Environment and install the required libraries",
"codeString": "conda create -n indicbert python=3.9\nconda activate indicbert\npip install -r requirements.txt",
"type": "instruction"
},
{
"instruction": "Additional Installation Notes",
"codeString": null,
"type": "heading"
},
{
"instruction": "We recommend you have access to a GPU for faster inference, or the code will fall back to the CPU.",
"codeString": null,
"type": "instruction"
}
],
"usage_steps_json": null,
"testimonials_json": null,
"latest": false,
"paper_award": null,
"license": []
},
{
"id": 9,
"title": "IndicNER",
"area": "LLM",
"published_on": "2022-12-04",
"conference": "ACL",
"description": "IndicNER is a model trained to complete the task of identifying named entities from sentences in Indian languages. Our model is specifically fine-tuned on 11 Indian languages over millions of sentences.",
"paper_link": "https://arxiv.org/abs/2212.10168",
"colab_link": "https://colab.research.google.com/drive/1sYa-PDdZQ_c9SzUgnhyb3Fl7j96QBCS8?usp=sharing",
"website_link": null,
"github_link": null,
"service_id": null,
"hf_link": "https://huggingface.co/ai4bharat/IndicNER",
"installation_steps_json": [
{
"instruction": "Installation Instructions",
"codeString": null,
"type": "heading"
},
{
"instruction": "1. Create a virtual environment (either venv or conda). We recommend using virtual environments to avoid dependency issues.",
"codeString": null,
"type": "instruction"
},
{
"instruction": "2. Install the Required Python Packages",
"codeString": "pip3 install transformers datasets sentencepiece seqeval",
"type": "instruction"
},
{
"instruction": "Note: Ensure that Python >= 3.8 is installed and properly configured in your environment.",
"codeString": null,
"type": "instruction"
}
],
"usage_steps_json": [
{
"instruction": "Running the IndicNER Model",
"codeString": null,
"type": "heading"
},
{
"instruction": "1. Import Necessary Libraries and Initialize the Model",
"codeString": "from transformers import AutoTokenizer, AutoModelForTokenClassification\nimport torch\n\ntokenizer = AutoTokenizer.from_pretrained(\"ai4bharat/IndicNER\")\nmodel = AutoModelForTokenClassification.from_pretrained(\"ai4bharat/IndicNER\")",
"type": "instruction"
},
{
"instruction": "2. Define the Prediction Function",
"codeString": "def get_predictions( sentence, tokenizer, model ):\n tok_sentence = tokenizer(sentence, return_tensors='pt')\n\n with torch.no_grad():\n logits = model(**tok_sentence).logits.argmax(-1)\n predicted_tokens_classes = [model.config.id2label[t.item()] for t in logits[0]]\n\n predicted_labels = []\n previous_token_id = 0\n word_ids = tok_sentence.word_ids()\n for word_index in range(len(word_ids)):\n if word_ids[word_index] == None:\n previous_token_id = word_ids[word_index]\n elif word_ids[word_index] == previous_token_id:\n previous_token_id = word_ids[word_index]\n else:\n predicted_labels.append( predicted_tokens_classes[ word_index ] )\n previous_token_id = word_ids[word_index]\n\n return predicted_labels",
"type": "instruction"
},
{
"instruction": "3. Test the Model with Example Sentences",
"codeString": "sentence = 'लगातार हमलावर हो रहे शिवपाल और राजभर को सपा की दो टूक, चिट्ठी जारी कर कहा- जहां जाना चाहें जा सकते हैं'\npredicted_labels = get_predictions(sentence=sentence, tokenizer=tokenizer, model=model)\nfor index in range(len(sentence.split(' '))):\n print( sentence.split(' ')[index] + '\\t' + predicted_labels[index] )\n\nsentence = 'ಶರಣ್ ರ ನೀವು ನೋಡಲೇಬೇಕಾದ ಟಾಪ್ 5 ಕಾಮಿಡಿ ಚಲನಚಿತ್ರಗಳು'\npredicted_labels = get_predictions(sentence=sentence, tokenizer=tokenizer, model=model)\nfor index in range(len(sentence.split(' '))):\n print( sentence.split(' ')[index] + '\\t' + predicted_labels[index] )",
"type": "instruction"
}
],
"testimonials_json": null,
"latest": false,
"paper_award": null,
"license": []
},
{
"id": 10,
"title": "Airavata",
"area": "LLM",
"published_on": "2024-01-04",
"conference": "ARXIV",
"description": "Airavata is a 7B OpenHathi model finetuned on the IndicInstruct dataset, which is a collection of instruction datasets (Anudesh, wikiHow, Flan v2, Dolly, Anthropic-HHH, OpenAssistant v1, and LMSYS-Chat).",
"paper_link": "https://arxiv.org/abs/2401.15006",
"colab_link": null,
"website_link": "https://ai4bharat.github.io/airavata/",
"github_link": "https://github.com/AI4Bharat/IndicInstruct",
"service_id": null,
"hf_link": "https://huggingface.co/ai4bharat/Airavata",
"installation_steps_json": [
{
"instruction": "Setup and Installation",
"codeString": null,
"type": "heading"
},
{
"instruction": "1. Create a virtual environment (either venv or conda). We recommend using virtual environments to avoid dependency issues.",
"codeString": null,
"type": "instruction"
},
{
"instruction": "2. Upgrade pip and Install Required Libraries",
"codeString": "pip install --upgrade pip\npip install torch transformers",
"type": "instruction"
},
{
"instruction": "3. Ensure CUDA is Installed and Available (Optional for GPU Use)",
"codeString": "If you have an NVIDIA GPU, make sure CUDA is installed and configured properly. You can check if CUDA is available with the following code in Python:\n\nimport torch\nprint(torch.cuda.is_available())",
"type": "instruction"
},
{
"instruction": "Additional Notes",
"codeString": null,
"type": "heading"
},
{
"instruction": "Make sure your environment has Python >= 3.8.",
"codeString": null,
"type": "instruction"
}
],
"usage_steps_json": [
{
"instruction": "Running the Airavata Model Code",
"codeString": null,
"type": "heading"
},
{
"instruction": "1. Import Required Libraries and Define the Device",
"codeString": "import torch\nfrom transformers import AutoTokenizer, AutoModelForCausalLM\n\ndevice = 'cuda' if torch.cuda.is_available() else 'cpu'",
"type": "instruction"
},
{
"instruction": "2. Define the Functions for Prompt Creation and Inference",
"codeString": "def create_prompt_with_chat_format(messages, bos='<s>', eos='</s>', add_bos=True):\n formatted_text = ''\n for message in messages:\n if message['role'] == 'system':\n formatted_text += '<|system|>\\n' + message['content'] + '\\n'\n elif message['role'] == 'user':\n formatted_text += '<|user|>\\n' + message['content'] + '\\n'\n elif message['role'] == 'assistant':\n formatted_text += '<|assistant|>\\n' + message['content'].strip() + eos + '\\n'\n else:\n raise ValueError('Tulu chat template only supports system, user, and assistant roles. Invalid role: {}'.format(message['role']))\n formatted_text += '<|assistant|>\\n'\n formatted_text = bos + formatted_text if add_bos else formatted_text\n return formatted_text\n\n\ndef inference(input_prompts, model, tokenizer):\n input_prompts = [create_prompt_with_chat_format([{'role': 'user', 'content': input_prompt}], add_bos=False) for input_prompt in input_prompts]\n encodings = tokenizer(input_prompts, padding=True, return_tensors='pt')\n encodings = encodings.to(device)\n with torch.inference_mode():\n outputs = model.generate(encodings.input_ids, do_sample=False, max_new_tokens=250)\n output_texts = tokenizer.batch_decode(outputs.detach(), skip_special_tokens=True)\n input_prompts = [tokenizer.decode(tokenizer.encode(input_prompt), skip_special_tokens=True) for input_prompt in input_prompts]\n output_texts = [output_text[len(input_prompt):] for input_prompt, output_text in zip(input_prompts, output_texts)]\n return output_texts",
"type": "instruction"
},
{
"instruction": "3. Load the Model and Tokenizer",
"codeString": "model_name = 'ai4bharat/Airavata'\n\ntokenizer = AutoTokenizer.from_pretrained(model_name, padding_side='left')\ntokenizer.pad_token = tokenizer.eos_token\nmodel = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16).to(device)",
"type": "instruction"
},
{
"instruction": "4. Define Input Prompts and Run Inference",
"codeString": "input_prompts = [\n 'मैं अपने समय प्रबंधन कौशल को कैसे सुधार सकता हूँ? मुझे पांच बिंदु बताएं।',\n 'मैं अपने समय प्रबंधन कौशल को कैसे सुधार सकता हूँ? मुझे पांच बिंदु बताएं और उनका वर्णन करें।',\n]\n\noutputs = inference(input_prompts, model, tokenizer)\nprint(outputs)",
"type": "instruction"
}
],
"testimonials_json": null,
"latest": false,
"paper_award": null,
"license": []
},
{
"id": 11,
"title": "IndicBART",
"area": "LLM",
"published_on": "2022-03-04",
"conference": "ACL",
"description": "IndicBART is a multilingual, sequence-to-sequence pre-trained model focusing on 11 Indic languages and English. IndicBART utilizes the orthographic similarity between Indic scripts to improve transfer learning between similar Indic languages. Supported languages include Assamese, Bengali, Gujarati, Hindi, Marathi, Odiya, Punjabi, Kannada, Malayalam, Tamil, Telugu and English.\r\n\r\nYou can use the IndicBART model to build natural language generation applications for Indian languages by finetuning the model with supervised training data for tasks like machine translation, summarization, question generation, etc.",
"paper_link": "https://arxiv.org/abs/2109.02903",
"colab_link": "https://colab.research.google.com/drive/13Gj7bAhR2HIdgSXEzp8fu4xwqrKsEYaa?usp=sharing",
"website_link": "https://github.com/AI4Bharat/indic-bart",
"github_link": "https://github.com/AI4Bharat/indic-bart",
"service_id": null,
"hf_link": "https://huggingface.co/ai4bharat/IndicBART",
"installation_steps_json": [
{
"instruction": "Setup and Installation",
"codeString": null,
"type": "heading"
},
{
"instruction": "1. Create a Virtual Environment (either venv or conda). We recommend using virtual environments to avoid dependency issues.",
"codeString": null,
"type": "instruction"
},
{
"instruction": "2. Install Required Python Libraries",
"codeString": "pip install torch transformers",
"type": "instruction"
},
{
"instruction": "Note: Make sure your Python version is >=3.8.",
"codeString": null,
"type": "instruction"
}
],
"usage_steps_json": [
{
"instruction": "Running the IndicBART Model",
"codeString": null,
"type": "heading"
},
{
"instruction": "Execute the following Python code to run the IndicBART model:",
"codeString": "from transformers import MBartForConditionalGeneration, AutoModelForSeq2SeqLM\nfrom transformers import AutoTokenizer\n\ntokenizer = AutoTokenizer.from_pretrained(\"ai4bharat/IndicBART\", do_lower_case=False, use_fast=False, keep_accents=True)\n\nmodel = AutoModelForSeq2SeqLM.from_pretrained(\"ai4bharat/IndicBART\")\n\n# Or use model = MBartForConditionalGeneration.from_pretrained(\"ai4bharat/IndicBART\")\n\n# Some initial mapping\nbos_id = tokenizer._convert_token_to_id_with_added_voc(\"<s>\")\neos_id = tokenizer._convert_token_to_id_with_added_voc(\"</s>\")\npad_id = tokenizer._convert_token_to_id_with_added_voc(\"<pad>\")\n\n# First tokenize the input and outputs. The format below is how IndicBART was trained so the input should be \"Sentence </s> <2xx>\" where xx is the language code. Similarly, the output should be \"<2yy> Sentence </s>\".\ninp = tokenizer(\"I am a boy </s> <2en>\", add_special_tokens=False, return_tensors=\"pt\", padding=True).input_ids \n\nout = tokenizer(\"<2hi> मैं एक लड़का हूँ </s>\", add_special_tokens=False, return_tensors=\"pt\", padding=True).input_ids \n\nmodel_outputs=model(input_ids=inp, decoder_input_ids=out[:,0:-1], labels=out[:,1:])\n\n# For generation. Pardon the messiness. Note the decoder_start_token_id.\n\nmodel.eval() # Set dropouts to zero\n\nmodel_output=model.generate(inp, use_cache=True, num_beams=4, max_length=20, min_length=1, early_stopping=True, pad_token_id=pad_id, bos_token_id=bos_id, eos_token_id=eos_id, decoder_start_token_id=tokenizer._convert_token_to_id_with_added_voc(\"<2en>\"))\n\n# Decode to get output strings\n\ndecoded_output=tokenizer.decode(model_output[0], skip_special_tokens=True, clean_up_tokenization_spaces=False)\n\nprint(decoded_output) # I am a boy",
"type": "instruction"
}
],
"testimonials_json": null,
"latest": false,
"paper_award": null,
"license": []
},
{
"id": 12,
"title": "IndicLID",
"area": "XLIT",
"published_on": "2023-05-04",
"conference": "ACL",
"description": "IndicLID is a publicly available language identification datasets for all 22 Indian languages in both native-script and romanized text. It is the first LID for romanized text in Indian languages and can predict 47 classes (24 native-script classes and 21 roman-script classes plus English and Others).",
"paper_link": "https://aclanthology.org/2023.acl-short.71",
"colab_link": "https://colab.research.google.com/drive/1pLMeaGhYgfNRmYHPHkvAmcR-8xMMZOme?usp=sharing",
"website_link": "https://github.com/AI4Bharat/IndicLID",
"github_link": "https://github.com/AI4Bharat/IndicLID",
"service_id": null,
"hf_link": null,
"installation_steps_json": [
{
"instruction": "Setup and Installation",
"codeString": null,
"type": "heading"
},
{
"instruction": "1. Install Required Python Packages",
"codeString": "pip3 install fasttext\npip3 install transformers",
"type": "instruction"
},
{
"instruction": "2. Clone the GitHub Repository and Navigate to the Inference Directory",
"codeString": "git clone https://github.com/AI4Bharat/IndicLID.git\ncd IndicLID/Inference",
"type": "instruction"
},
{
"instruction": "3. Create a Directory for Models and Navigate to It",
"codeString": "mkdir models\ncd models",
"type": "instruction"
},
{
"instruction": "4. Download the IndicLID Model Files",
"codeString": "wget https://github.com/AI4Bharat/IndicLID/releases/download/v1.0/indiclid-bert.zip\nwget https://github.com/AI4Bharat/IndicLID/releases/download/v1.0/indiclid-ftn.zip\nwget https://github.com/AI4Bharat/IndicLID/releases/download/v1.0/indiclid-ftr.zip",
"type": "instruction"
},
{
"instruction": "5. Unzip the Downloaded Model Files",
"codeString": "unzip indiclid-bert.zip\nunzip indiclid-ftn.zip\nunzip indiclid-ftr.zip",
"type": "instruction"
},
{
"instruction": "6. Navigate Back to the Inference Directory",
"codeString": "cd ..",
"type": "instruction"
}
],
"usage_steps_json": [
{
"instruction": "Running the IndicLID Model",
"codeString": null,
"type": "heading"
},
{
"instruction": "1. Run the Following Code to Perform Language Identification Using the IndicLID Model",
"codeString": "# Import the IndicLID class\nfrom ai4bharat.IndicLID import IndicLID\n\n# Initialize the IndicLID model with specified thresholds\nIndicLID_model = IndicLID(input_threshold=0.5, roman_lid_threshold=0.6)\n\n# Define the test samples\n# These samples include both native script and Romanized script text\n# Modify the samples as needed to test different inputs\n\n# Test samples for prediction\ntest_samples = [\n 'आज के दिन का मौसम अत्यंत सुंदर है, जहां सदैव छाए हुए बादल, गुलाबी रंगीन शाम, और हल्की हवा के साथ प्राकृतिक सौंदर्य का आनंद लेने का एक सुनहरा अवसर है',\n 'aaj key din ka mausam atyant sundar hai, jahan sadaiv chae hue baadal, gulabi rangeen shaam, aur halki havaa key saath praakritik saundarya kaa anand lene kaa aeka sunhara avsar haye',\n]\n\n# Set the batch size for predictions\nbatch_size = 1\n\n# Run the batch prediction\noutputs = IndicLID_model.batch_predict(test_samples, batch_size)\n\n# Print the outputs\n# This will display the language identification results for each sample\nprint(outputs)",
"type": "instruction"
}
],
"testimonials_json": null,
"latest": false,
"paper_award": null,
"license": []
},
{
"id": 13,
"title": "RomanSetu",
"area": "LLM",
"published_on": "2024-06-04",
"conference": "ACL",
"description": "RomanSetu proposes an approach that utilizes the romanized form of text as an interface for LLMs, hypothesizing that its frequent informal use and shared tokens with English enhance crosslingual alignment. The results indicate that romanized text not only reduces token fertility by 2x-4x but also matches / outperforms native script representation across various NLU, NLG, and MT tasks.",
"paper_link": "https://arxiv.org/abs/2401.14280",
"colab_link": null,
"website_link": "https://github.com/AI4Bharat/romansetu",
"github_link": "https://github.com/AI4Bharat/romansetu",
"service_id": null,
"hf_link": null,
"installation_steps_json": null,
"usage_steps_json": null,
"testimonials_json": null,
"latest": false,
"paper_award": null,
"license": []
},
{
"id": 14,
"title": "IndicConformer",
"area": "ASR",
"published_on": "2024-09-07",
"conference": null,
"description": "AI4Bharat's IndicConformers is a suite of ASR models built to deliver accurate speech-to-text conversion in all 22 official Indian languages. By leveraging cutting-edge deep learning techniques, these models provide precise transcriptions. As the country's first open-source ASR system covering such a vast array of languages, AI4Bharat Indic Conformer is a transformative tool for making technology more inclusive and accessible to all. IndicConformer is released under the MIT license.",
"paper_link": null,
"colab_link": "https://colab.research.google.com/drive/1ZQJEhYgLKS72_V4LvNmsyU2zF9pICRvE",
"website_link": "https://ai4bharat.github.io/ai4b-website/areas/model/ASR/IndicConformer",
"github_link": "https://github.com/AI4Bharat/IndicConformerASR",
"service_id": "ai4bharat/conformer-multilingual-all--gpu-t4",
"hf_link": "https://huggingface.co/collections/ai4bharat/indicconformer-66d9e933a243cba4b679cb7f",
"installation_steps_json": [
{
"instruction": "Setting up conda",
"codeString": null,
"type": "heading"
},
{
"instruction": "Creating and activating conda environment",
"codeString": "conda create -n temo python=3.10\nconda activate temo",
"type": "instruction"
},
{
"instruction": "Installing libraries",
"codeString": "pip3 install torch torchvision torchaudio\npip install packaging\npip install huggingface_hub==0.23.2",
"type": "instruction"
},
{
"instruction": "Cloning repository",
"codeString": "git clone https://github.com/AI4Bharat/NeMo.git\ncd NeMo\nbash reinstall.sh",
"type": "instruction"
}
],
"usage_steps_json": [
{
"instruction": "Inference",
"codeString": null,
"type": "heading"
},
{
"instruction": "Download the model checkpoints from the GitHub repository.",
"codeString": "https://github.com/AI4Bharat/IndicConformerASR",
"type": "instruction"
},
{
"instruction": "Loading the checkpoint",
"codeString": "device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\nmodel = nemo_asr.models.EncDecCTCModel.restore_from(restore_path='<CHECKPOINT_PATH>.nemo')\nmodel.freeze()\nmodel = model.to(device)",
"type": "instruction"
},
{
"instruction": "CTC Decoding",
"codeString": "model.cur_decoder = 'ctc'\nctc_text = model.transcribe(['/path/audio_path.wav'], batch_size=1,logprobs=False, language_id='LANG_ID')[0]",
"type": "instruction"
},
{
"instruction": "RNN-T Decoding",
"codeString": "model.cur_decoder = 'rnnt'\nctc_text = model.transcribe(['/path/audio_path.wav'], batch_size=1, language_id='LANG_ID')[0]",
"type": "instruction"
}
],
"testimonials_json": null,
"latest": true,
"paper_award": null,
"license": []
},
{
"id": 15,
"title": "Hercule",
"area": "LLM",
"published_on": "2024-10-17",
"conference": "ACL",
"description": "Hercule is a cross-lingual evaluation model introduced as part of the CIA Suite to assess multilingual Large Language Models (LLMs). It addresses the challenge of evaluating multilingual LLMs by using English reference responses to score multilingual outputs.\r\n\r\nFine-tuned on the INTEL dataset, Hercule demonstrates better alignment with human judgments compared to zero-shot evaluations by proprietary models like GPT-4, on the RECON test set. It excels particularly in low-resource scenarios and supports zero-shot evaluations on unseen languages. The model employs reference-based evaluation, providing feedback and scores on a 1-5 scale, and highlights the effectiveness of lightweight fine-tuning methods (like LoRA) for efficient multilingual evaluation.",
"paper_link": "https://arxiv.org/abs/2410.13394",
"colab_link": null,
"website_link": "https://huggingface.co/collections/ai4bharat/cia-suite-66ea9a7e18a6c70bd8de27a1",
"github_link": "https://github.com/AI4Bharat/CIA",
"service_id": null,
"hf_link": "https://huggingface.co/collections/ai4bharat/cia-suite-66ea9a7e18a6c70bd8de27a1",
"installation_steps_json": null,
"usage_steps_json": null,
"testimonials_json": null,
"latest": false,
"paper_award": null,
"license": []
},
{
"id": 16,
"title": "ELAICHI",
"area": "TTS",
"published_on": "2024-10-23",
"conference": "Under Review",
"description": "Recent advancements in Text-to-Speech (TTS) technology have led to natural-sounding speech for English, primarily due to the availability of large-scale, high-quality web data. However, many other languages lack access to such resources, relying instead on limited studio-quality data. This scarcity results in synthesized speech that often suffers from intelligibility issues, particularly with low-frequency character bigrams. In this paper, we propose three solutions to address this challenge. First, we leverage high-quality data from linguistically or geographically related languages to improve TTS for the target language. Second, we utilize low-quality Automatic Speech Recognition (ASR) data recorded in non-studio environments, which is refined using denoising and speech enhancement models. Third, we apply knowledge distillation from large-scale models using synthetic data to generate more robust outputs. Our experiments with Hindi demonstrate significant reductions in intelligibility issues, as validated by human evaluators. We propose this methodology as a viable alternative for languages with limited access to high-quality data, enabling them to collectively benefit from shared resources.",
"paper_link": "https://arxiv.org/pdf/2410.17901",
"colab_link": null,
"website_link": null,
"github_link": "https://github.com/AI4Bharat/ELAICHI",
"service_id": null,
"hf_link": "https://huggingface.co/collections/ai4bharat/elaichi-671a1cb837b351a28d1c5a8b",
"installation_steps_json": null,
"usage_steps_json": null,
"testimonials_json": null,
"latest": false,
"paper_award": null,
"license": []
},
{
"id": 17,
"title": "Indic Parler TTS",
"area": "TTS",
"published_on": "2025-05-19",
"conference": "INTERSPEECH",
"description": "Indic Parler-TTS is a state-of-the-art TTS system, that brings voices to life in 23 Indian languages and English, delivering realistic, expressive, and highly controllable speech synthesis. With fine-grained text control over speaker identity, emotion, and prosody, IndicParler-TTS eliminates the need for reference speech prompts, making speech generation more flexible and accessible. It supports a wide range of acoustic conditions, from studio-quality clarity to spontaneous, real-world environments, enabling diverse applications like assistive technology, audiobooks, dubbing, and conversational speech. As the first open-source TTS system covering 23 Indian languages, IndicParler-TTS is released under the Apache 2.0 license, empowering researchers, developers, and businesses to advance speech AI for India’s multilingual landscape.",
"paper_link": "https://arxiv.org/pdf/2505.18609v2",
"colab_link": null,
"website_link": null,
"github_link": null,
"service_id": null,
"hf_link": "https://huggingface.co/ai4bharat/indic-parler-tts",
"installation_steps_json": null,
"usage_steps_json": null,
"testimonials_json": null,
"latest": false,
"paper_award": null,
"license": [
{
"id": 1,
"license_name": "cc-by-4.0",
"liense_url": "https://choosealicense.com/licenses/cc-by-4.0/"
}
]
},
{
"id": 18,
"title": "IndicF5",
"area": "TTS",
"published_on": "2025-03-13",
"conference": null,
"description": "IndicF5 is a near-human polyglot Text-to-Speech (TTS) model trained on 1417 hours of high-quality speech from Rasa, IndicTTS, LIMMITS, and IndicVoices-R.",
"paper_link": null,
"colab_link": null,
"website_link": null,
"github_link": "https://github.com/AI4Bharat/IndicF5",
"service_id": null,
"hf_link": "https://huggingface.co/ai4bharat/IndicF5",
"installation_steps_json": null,
"usage_steps_json": null,
"testimonials_json": null,
"latest": true,
"paper_award": null,
"license": [
{
"id": 1,
"license_name": "cc-by-4.0",
"liense_url": "https://choosealicense.com/licenses/cc-by-4.0/"
}
]
},
{
"id": 19,
"title": "CTQAScorer",
"area": "NMT",
"published_on": "2023-05-23",
"conference": "EMNLP",
"description": "CTQ Scorer (Contextual Translation Quality), a regression model, that selects examples based on multiple features in order to maximize the translation quality",
"paper_link": "https://arxiv.org/abs/2305.14105",
"colab_link": null,
"website_link": null,
"github_link": null,
"service_id": null,
"hf_link": null,
"installation_steps_json": null,
"usage_steps_json": null,
"testimonials_json": null,
"latest": false,
"paper_award": null,
"license": []
},
{
"id": 20,
"title": "DecoMT",
"area": "NMT",
"published_on": "2023-12-06",
"conference": "EMNLP",
"description": "A novel approach of fewshot prompting that decomposes the translation process into a sequence of word chunk\r\ntranslations",
"paper_link": "https://aclanthology.org/2023.emnlp-main.279.pdf",
"colab_link": null,
"website_link": null,
"github_link": null,
"service_id": null,
"hf_link": null,
"installation_steps_json": null,
"usage_steps_json": null,
"testimonials_json": null,
"latest": false,
"paper_award": null,
"license": []
},
{
"id": 21,
"title": "OpenHands",
"area": "SLR",
"published_on": "2022-05-16",
"conference": "ACL",
"description": "A library where we take four key ideas from the NLP community for low-resource languages and apply them to sign languages for word-level recognition",
"paper_link": "https://arxiv.org/abs/2110.05877",
"colab_link": null,
"website_link": null,
"github_link": null,
"service_id": null,
"hf_link": null,
"installation_steps_json": null,
"usage_steps_json": null,
"testimonials_json": null,
"latest": false,
"paper_award": null,
"license": []
},
{
"id": 22,
"title": "Sign2Vec",
"area": "SLR",
"published_on": "2022-05-11",
"conference": "NeurIPS",
"description": "A graph-based model with 5.2M parameters that is pretrained on SignCorpus.",
"paper_link": "https://papers.nips.cc/paper_files/paper/2022/file/eb011fd258c763c44d8c6a0e9ce04f17-Paper-Datasets_and_Benchmarks.pdf",
"colab_link": null,
"website_link": null,
"github_link": null,
"service_id": null,
"hf_link": null,
"installation_steps_json": null,
"usage_steps_json": null,
"testimonials_json": null,
"latest": false,
"paper_award": null,
"license": []
},
{
"id": 23,
"title": "Roman Lens",
"area": "LLM",
"published_on": "2025-01-13",
"conference": "ACL",
"description": "Large Language Models (LLMs) exhibit remarkable multilingual generalization despite being predominantly trained on English-centric corpora. A fundamental question arises: how do LLMs achieve such robust multilingual capabilities? For non-Latin script languages, we investigate the role of romanization—the representation of non-Latin scripts using Latin characters—as a bridge in multilingual processing. Using mechanistic interpretability techniques, we analyze next-token generation and find that intermediate layers frequently represent target words in romanized form before transitioning to native script, a phenomenon we term Latent Romanization. Further, through activation patching experiments, we demonstrate that LLMs encode semantic concepts similarly across native and romanized scripts, suggesting a shared underlying representation. Additionally in translation towards non Latin languages, our findings reveal that when the target language is in romanized form, its representations emerge earlier in the model’s layers compared to native script. These insights contribute to a deeper understanding of multilingual representation in LLMs and highlight the implicit role of romanization in facilitating language transfer. Our work provides new directions for potentially improving multilingual language modeling and interpretability.",
"paper_link": "http://arxiv.org/html/2502.07424v1",
"colab_link": null,
"website_link": null,
"github_link": null,
"service_id": null,
"hf_link": null,
"installation_steps_json": null,
"usage_steps_json": null,
"testimonials_json": null,
"latest": false,
"paper_award": null,
"license": []
},
{
"id": 24,
"title": "Sarvam-Translate",
"area": "NMT",
"published_on": "2025-06-13",
"conference": null,
"description": "Sarvam-Translate is an advanced translation model built in collaboration with Sarvam AI, specifically designed for comprehensive, document-level translation across the 22 official Indian languages, built on Gemma3-4B-IT. It addresses modern translation needs by moving beyond isolated sentences to handle long-context inputs, diverse content types, and various formats. Sarvam-Translate aims to provide high-quality, contextually aware translations for Indian languages, which have traditionally lagged behind high-resource languages in LLM performance.",
"paper_link": "https://www.sarvam.ai/blogs/sarvam-translate",
"colab_link": null,
"website_link": "https://dashboard.sarvam.ai/translate",
"github_link": null,
"service_id": null,
"hf_link": "https://huggingface.co/sarvamai/sarvam-translate",
"installation_steps_json": null,
"usage_steps_json": null,
"testimonials_json": null,
"latest": false,
"paper_award": null,
"license": []
},
{
"id": 26,
"title": "VocabAdaptation LLM",
"area": "NLP",
"published_on": "2024-07-12",
"conference": "CoNLL",
"description": "VocabAdaptation_LLM is an open-source toolkit from AI4Bharat designed to expand the vocabulary of large language models (LLMs) to support new languages, especially Indic languages. It provides scripts and pipelines for training new tokenizers, merging them with existing LLM tokenizers, and initializing embeddings for newly added vocabulary—enabling efficient adaptation and evaluation of LLMs in multilingual settings. This toolkit helps scale language model support for regions with diverse linguistic needs.",
"paper_link": "https://aclanthology.org/2024.conll-1.8.pdf",
"colab_link": null,
"website_link": null,
"github_link": "https://github.com/AI4Bharat/VocabAdaptation_LLM",
"service_id": null,
"hf_link": null,
"installation_steps_json": null,
"usage_steps_json": null,
"testimonials_json": null,
"latest": false,
"paper_award": null,
"license": []
}
]