dc.contributor.author | Gurevych, Iryna |
dc.contributor.author | Habernal, Ivan |
dc.contributor.author | Zayed, Omnia |
dc.date.accessioned | 2017-06-07T13:08:55Z |
dc.date.available | 2017-06-07T13:08:55Z |
dc.date.issued | 2016-04-14 |
dc.identifier.uri | http://hdl.handle.net/11372/LRT-2207 |
dc.description | A large web corpus (over 10 billion tokens) in 50+ languages, licensed under the Creative Commons license family, that has been extracted from CommonCrawl, the largest publicly available general Web crawl to date, with about 2 billion crawled URLs. |
dc.language.iso | afr |
dc.language.iso | ara |
dc.language.iso | ben |
dc.language.iso | bul |
dc.language.iso | ces |
dc.language.iso | dan |
dc.language.iso | deu |
dc.language.iso | ell |
dc.language.iso | eng |
dc.language.iso | est |
dc.language.iso | fas |
dc.language.iso | fin |
dc.language.iso | fra |
dc.language.iso | guj |
dc.language.iso | heb |
dc.language.iso | hin |
dc.language.iso | hrv |
dc.language.iso | hun |
dc.language.iso | ind |
dc.language.iso | ita |
dc.language.iso | jpn |
dc.language.iso | kor |
dc.language.iso | lav |
dc.language.iso | lit |
dc.language.iso | mal |
dc.language.iso | mkd |
dc.language.iso | nld |
dc.language.iso | nor |
dc.language.iso | pol |
dc.language.iso | por |
dc.language.iso | ron |
dc.language.iso | rus |
dc.language.iso | slk |
dc.language.iso | slv |
dc.language.iso | som |
dc.language.iso | spa |
dc.language.iso | sqi |
dc.language.iso | swa |
dc.language.iso | swe |
dc.language.iso | tam |
dc.language.iso | tgl |
dc.language.iso | tha |
dc.language.iso | tur |
dc.language.iso | ukr |
dc.language.iso | und |
dc.language.iso | vie |
dc.language.iso | zho |
dc.publisher | Technische Universität Darmstadt |
dc.relation.isreferencedby | http://www.lrec-conf.org/proceedings/lrec2016/pdf/388_Paper.pdf |
dc.rights | Creative Commons - Attribution-NoDerivatives 4.0 International (CC BY-ND 4.0) |
dc.rights.uri | http://creativecommons.org/licenses/by-nd/4.0/ |
dc.source.uri | https://dkpro.github.io/dkpro-c4corpus/ |
dc.subject | CommonCrawl |
dc.subject | Creative Commons |
dc.subject | Web corpus |
dc.subject | Amazon Web Services |
dc.title | C4Corpus (CC BY-ND part) |
dc.type | corpus |
metashare.ResourceInfo#ContentInfo.mediaType | text |
dc.rights.label | PUB |
has.files | yes |
branding | LRT + Open Submissions |
contact.person | Ivan Habernal habernal@ukp.informatik.tu-darmstadt.de Technische Universität Darmstadt |
sponsor | German Research Foundation (DFG) DIP DA 1600/1-1 Information Consolidation: A New Paradigm in Knowledge Search nationalFunds |
sponsor | Amazon Amazon Web Services in Education Grant Web Services in Education Grant Other |
size.info | 10000000000 tokens |
files.size | 804378416 |
files.count | 48 |
Files in this item
Download all files in item (767.12 MB)
This item is
Creative Commons - Attribution-NoDerivatives 4.0 International (CC BY-ND 4.0)
Publicly Available
and licensed under: Creative Commons - Attribution-NoDerivatives 4.0 International (CC BY-ND 4.0)
- Name
- Lic_by-nd_Lang_af_NoBoilerplate_true_MinHtml_true-r-00009.seg-00000.warc.gz
- Size
- 70.82 KB
- Format
- application/x-gzip
- MD5
- 6dd176d148a51be2952333b4307c28da
- Name
- Lic_by-nd_Lang_ar_NoBoilerplate_true_MinHtml_true-r-00021.seg-00000.warc.gz
- Size
- 212.67 KB
- Format
- application/x-gzip
- MD5
- 90e05292f00fbffa63f98d29a1f8106c
- Name
- Lic_by-nd_Lang_bg_NoBoilerplate_true_MinHtml_true-r-00010.seg-00000.warc.gz
- Size
- 352.63 KB
- Format
- application/x-gzip
- MD5
- 12867c099d4f49a261458c82eaa83b25
- Name
- Lic_by-nd_Lang_bn_NoBoilerplate_true_MinHtml_true-r-00017.seg-00000.warc.gz
- Size
- 60.64 KB
- Format
- application/x-gzip
- MD5
- e2c4697597345ad96dbf5ebae002e932
- Name
- Lic_by-nd_Lang_cs_NoBoilerplate_true_MinHtml_true-r-00022.seg-00000.warc.gz
- Size
- 355.32 KB
- Format
- application/x-gzip
- MD5
- f92b4767fc29883ac7da5bfed91de3e2
- Name
- Lic_by-nd_Lang_da_NoBoilerplate_true_MinHtml_true-r-00004.seg-00000.warc.gz
- Size
- 99.78 KB
- Format
- application/x-gzip
- MD5
- da7a05a621c139fbd11b08ddfc16269d
- Name
- Lic_by-nd_Lang_de_NoBoilerplate_true_MinHtml_true-r-00008.seg-00000.warc.gz
- Size
- 7.21 MB
- Format
- application/x-gzip
- MD5
- 5675b72370d943026a5e6be14b92af3c
- Name
- Lic_by-nd_Lang_el_NoBoilerplate_true_MinHtml_true-r-00015.seg-00000.warc.gz
- Size
- 796.64 KB
- Format
- application/x-gzip
- MD5
- 7b059ba062c1fb2287abf51e98c9b1a1
- Name
- Lic_by-nd_Lang_en_NoBoilerplate_true_MinHtml_true-r-00017.seg-00000.warc.gz
- Size
- 661.23 MB
- Format
- application/x-gzip
- MD5
- 6db9cf0a25e3f04ecc85a07fd85d55f5
- Name
- Lic_by-nd_Lang_es_NoBoilerplate_true_MinHtml_true-r-00022.seg-00000.warc.gz
- Size
- 43.64 MB
- Format
- application/x-gzip
- MD5
- 7d1caeb8da2e600438cc49dbd52f284c
- Name
- Lic_by-nd_Lang_et_NoBoilerplate_true_MinHtml_true-r-00023.seg-00000.warc.gz
- Size
- 20.28 KB
- Format
- application/x-gzip
- MD5
- a2d74d9f27cd49a6762d3c07c1d7de40
- Name
- Lic_by-nd_Lang_fa_NoBoilerplate_true_MinHtml_true-r-00004.seg-00000.warc.gz
- Size
- 98.16 KB
- Format
- application/x-gzip
- MD5
- b87e118af4dbaa170b8caf9d7b03c5b9
- Name
- Lic_by-nd_Lang_fi_NoBoilerplate_true_MinHtml_true-r-00012.seg-00000.warc.gz
- Size
- 34.75 KB
- Format
- application/x-gzip
- MD5
- 17f4d007f8e08b81a21e1be4e48a5850
- Name
- Lic_by-nd_Lang_fr_NoBoilerplate_true_MinHtml_true-r-00021.seg-00000.warc.gz
- Size
- 5.36 MB
- Format
- application/x-gzip
- MD5
- b608cba4a3f9d8ef55358bdad06a48f2
- Name
- Lic_by-nd_Lang_gu_NoBoilerplate_true_MinHtml_true-r-00024.seg-00000.warc.gz
- Size
- 32.55 KB
- Format
- application/x-gzip
- MD5
- c4cc0f75e08a9d3f7ab1264cfa90ff4a
- Name
- Lic_by-nd_Lang_he_NoBoilerplate_true_MinHtml_true-r-00008.seg-00000.warc.gz
- Size
- 1.49 MB
- Format
- application/x-gzip
- MD5
- bb1d80224d07048e1f23fd14ed6d950d
- Name
- Lic_by-nd_Lang_hi_NoBoilerplate_true_MinHtml_true-r-00012.seg-00000.warc.gz
- Size
- 29.78 KB
- Format
- application/x-gzip
- MD5
- 4fabf5afb3d966fe7fc0413b71ae9344
- Name
- Lic_by-nd_Lang_hr_NoBoilerplate_true_MinHtml_true-r-00021.seg-00000.warc.gz
- Size
- 3.39 MB
- Format
- application/x-gzip
- MD5
- ffb15e66726c448742d26cf242734d29
- Name
- Lic_by-nd_Lang_hu_NoBoilerplate_true_MinHtml_true-r-00024.seg-00000.warc.gz
- Size
- 279.43 KB
- Format
- application/x-gzip
- MD5
- 40149f8a90e81ceca917073bc634387e
- Name
- Lic_by-nd_Lang_id_NoBoilerplate_true_MinHtml_true-r-00007.seg-00000.warc.gz
- Size
- 1.62 MB
- Format
- application/x-gzip
- MD5
- 082147b8798644068ac2b8d95ac23c73
- Name
- Lic_by-nd_Lang_it_NoBoilerplate_true_MinHtml_true-r-00023.seg-00000.warc.gz
- Size
- 9.68 MB
- Format
- application/x-gzip
- MD5
- 7f8a44533dded194db1da16ac282120b
- Name
- Lic_by-nd_Lang_ja_NoBoilerplate_true_MinHtml_true-r-00004.seg-00000.warc.gz
- Size
- 672.04 KB
- Format
- application/x-gzip
- MD5
- 93cc495ca87b85eb66676f766ad26a1a
- Name
- Lic_by-nd_Lang_ko_NoBoilerplate_true_MinHtml_true-r-00018.seg-00000.warc.gz
- Size
- 275.64 KB
- Format
- application/x-gzip
- MD5
- 6502058cf6e85f23d66b98a724baffe3
- Name
- Lic_by-nd_Lang_lt_NoBoilerplate_true_MinHtml_true-r-00023.seg-00000.warc.gz
- Size
- 84.64 KB
- Format
- application/x-gzip
- MD5
- e4022e3a238a0f07e0b7885bc05fe804
- Name
- Lic_by-nd_Lang_lv_NoBoilerplate_true_MinHtml_true-r-00025.seg-00000.warc.gz
- Size
- 69.83 KB
- Format
- application/x-gzip
- MD5
- 56418bf2e0e2434e6fb92033938027e4
- Name
- Lic_by-nd_Lang_mk_NoBoilerplate_true_MinHtml_true-r-00014.seg-00000.warc.gz
- Size
- 185.17 KB
- Format
- application/x-gzip
- MD5
- 41dd5207d2a55a905470fec11c968230
- Name
- Lic_by-nd_Lang_ml_NoBoilerplate_true_MinHtml_true-r-00015.seg-00000.warc.gz
- Size
- 99.83 KB
- Format
- application/x-gzip
- MD5
- c099bdb00557f501571b6ccd3fd44585
- Name
- Lic_by-nd_Lang_nl_NoBoilerplate_true_MinHtml_true-r-00015.seg-00000.warc.gz
- Size
- 1.14 MB
- Format
- application/x-gzip
- MD5
- 630fcd0ab89a3c9e596da7fe9a203be4
- Name
- Lic_by-nd_Lang_no_NoBoilerplate_true_MinHtml_true-r-00018.seg-00000.warc.gz
- Size
- 186.9 KB
- Format
- application/x-gzip
- MD5
- 41bf869c2ffcb36c4ef8827fb9ffa47a
- Name
- Lic_by-nd_Lang_pl_NoBoilerplate_true_MinHtml_true-r-00015.seg-00000.warc.gz
- Size
- 888.55 KB
- Format
- application/x-gzip
- MD5
- 5c24637505d3a19be1748dbe16b54849
- Name
- Lic_by-nd_Lang_pt_NoBoilerplate_true_MinHtml_true-r-00023.seg-00000.warc.gz
- Size
- 20.42 MB
- Format
- application/x-gzip
- MD5
- 44145959e8a82f0cc1b301c84730ecd0
- Name
- Lic_by-nd_Lang_ro_NoBoilerplate_true_MinHtml_true-r-00018.seg-00000.warc.gz
- Size
- 350.45 KB
- Format
- application/x-gzip
- MD5
- c252f38c3a265c7ec68dca29cc9794c1
- Name
- Lic_by-nd_Lang_ru_NoBoilerplate_true_MinHtml_true-r-00024.seg-00000.warc.gz
- Size
- 1.86 MB
- Format
- application/x-gzip
- MD5
- 0a984e0118bfd67f22dd34c679aa85e2
- Name
- Lic_by-nd_Lang_sk_NoBoilerplate_true_MinHtml_true-r-00014.seg-00000.warc.gz
- Size
- 156.72 KB
- Format
- application/x-gzip
- MD5
- 78aa44bbb8766a5b41871eb080c2839c
- Name
- Lic_by-nd_Lang_sl_NoBoilerplate_true_MinHtml_true-r-00015.seg-00000.warc.gz
- Size
- 157.94 KB
- Format
- application/x-gzip
- MD5
- 083c8774054f5eb042ed42d8a8c5edf9
- Name
- Lic_by-nd_Lang_so_NoBoilerplate_true_MinHtml_true-r-00018.seg-00000.warc.gz
- Size
- 6.99 KB
- Format
- application/x-gzip
- MD5
- 30e23874e56e9f833063ec7f54517235
- Name
- Lic_by-nd_Lang_sq_NoBoilerplate_true_MinHtml_true-r-00020.seg-00000.warc.gz
- Size
- 202.56 KB
- Format
- application/x-gzip
- MD5
- bf04172d902372bc30fe154132b9713c
- Name
- Lic_by-nd_Lang_sv_NoBoilerplate_true_MinHtml_true-r-00025.seg-00000.warc.gz
- Size
- 614.46 KB
- Format
- application/x-gzip
- MD5
- bcc85a78d642c42ee3a4401f2fe718bc
- Name
- Lic_by-nd_Lang_sw_NoBoilerplate_true_MinHtml_true-r-00026.seg-00000.warc.gz
- Size
- 1.22 KB
- Format
- application/x-gzip
- MD5
- 89c7fc00ce8b7768784397c228aeb95d
- Name
- Lic_by-nd_Lang_ta_NoBoilerplate_true_MinHtml_true-r-00004.seg-00000.warc.gz
- Size
- 111.7 KB
- Format
- application/x-gzip
- MD5
- 1839105dbf717e1038ede8d9c9f225ef
- Name
- Lic_by-nd_Lang_th_NoBoilerplate_true_MinHtml_true-r-00011.seg-00000.warc.gz
- Size
- 66.46 KB
- Format
- application/x-gzip
- MD5
- 55ba135e7e59755cfabf87832e868b90
- Name
- Lic_by-nd_Lang_tl_NoBoilerplate_true_MinHtml_true-r-00015.seg-00000.warc.gz
- Size
- 44.42 KB
- Format
- application/x-gzip
- MD5
- ce46d8b2b91e4d860fdc5b57f92629e2
- Name
- Lic_by-nd_Lang_tr_NoBoilerplate_true_MinHtml_true-r-00021.seg-00000.warc.gz
- Size
- 245.71 KB
- Format
- application/x-gzip
- MD5
- 6058a475efc3c593c11cf33f5d184503
- Name
- Lic_by-nd_Lang_uk_NoBoilerplate_true_MinHtml_true-r-00014.seg-00000.warc.gz
- Size
- 198.51 KB
- Format
- application/x-gzip
- MD5
- 0a658291dcce731ddbefc60cfed9cf15
- Name
- Lic_by-nd_Lang_unknown_NoBoilerplate_true_MinHtml_true-r-00017.seg-00000.warc.gz
- Size
- 2.85 MB
- Format
- application/x-gzip
- MD5
- 8286404ccbc2978ed390a044c6273da9
- Name
- Lic_by-nd_Lang_vi_NoBoilerplate_true_MinHtml_true-r-00012.seg-00000.warc.gz
- Size
- 170.35 KB
- Format
- application/x-gzip
- MD5
- e310581df324aff209811162408d8a68
- Name
- Lic_by-nd_Lang_zh-cn_NoBoilerplate_true_MinHtml_true-r-00017.seg-00000.warc.gz
- Size
- 96.9 KB
- Format
- application/x-gzip
- MD5
- 2bc05b9a5a2bbb983a12984f0605e7af
- Name
- Lic_by-nd_Lang_zh-tw_NoBoilerplate_true_MinHtml_true-r-00026.seg-00000.warc.gz
- Size
- 74.62 KB
- Format
- application/x-gzip
- MD5
- a9382623cbbf025af0c796ba9b96105b