dc.contributor.author | Gurevych, Iryna |
dc.contributor.author | Habernal, Ivan |
dc.contributor.author | Zayed, Omnia |
dc.date.accessioned | 2017-06-07T13:07:31Z |
dc.date.available | 2017-06-07T13:07:31Z |
dc.date.issued | 2016-04-14 |
dc.identifier.uri | http://hdl.handle.net/11372/LRT-2205 |
dc.description | A large web corpus (over 10 billion tokens) in 50+ languages, licensed under the Creative Commons license family and extracted from CommonCrawl, the largest publicly available general Web crawl to date, with about 2 billion crawled URLs. |
dc.language.iso | afr |
dc.language.iso | ara |
dc.language.iso | ben |
dc.language.iso | bul |
dc.language.iso | ces |
dc.language.iso | dan |
dc.language.iso | deu |
dc.language.iso | ell |
dc.language.iso | eng |
dc.language.iso | est |
dc.language.iso | fas |
dc.language.iso | fin |
dc.language.iso | fra |
dc.language.iso | guj |
dc.language.iso | heb |
dc.language.iso | hin |
dc.language.iso | hrv |
dc.language.iso | hun |
dc.language.iso | ind |
dc.language.iso | ita |
dc.language.iso | jpn |
dc.language.iso | kan |
dc.language.iso | kor |
dc.language.iso | lav |
dc.language.iso | lit |
dc.language.iso | mal |
dc.language.iso | mar |
dc.language.iso | mkd |
dc.language.iso | nep |
dc.language.iso | nld |
dc.language.iso | nor |
dc.language.iso | pol |
dc.language.iso | por |
dc.language.iso | ron |
dc.language.iso | rus |
dc.language.iso | slk |
dc.language.iso | slv |
dc.language.iso | som |
dc.language.iso | spa |
dc.language.iso | sqi |
dc.language.iso | swa |
dc.language.iso | swe |
dc.language.iso | tam |
dc.language.iso | tel |
dc.language.iso | tgl |
dc.language.iso | tha |
dc.language.iso | tur |
dc.language.iso | ukr |
dc.language.iso | und |
dc.language.iso | urd |
dc.language.iso | vie |
dc.language.iso | zho |
dc.publisher | Technische Universität Darmstadt |
dc.relation.isreferencedby | http://www.lrec-conf.org/proceedings/lrec2016/pdf/388_Paper.pdf |
dc.rights | Creative Commons - Attribution-NonCommercial-NoDerivatives 4.0 International (CC BY-NC-ND 4.0) |
dc.rights.uri | http://creativecommons.org/licenses/by-nc-nd/4.0/ |
dc.source.uri | https://dkpro.github.io/dkpro-c4corpus/ |
dc.subject | CommonCrawl |
dc.subject | Creative Commons |
dc.subject | Web corpus |
dc.subject | Amazon Web Services |
dc.title | C4Corpus (CC BY-NC-ND part) |
dc.type | corpus |
metashare.ResourceInfo#ContentInfo.mediaType | text |
dc.rights.label | PUB |
has.files | yes |
branding | LRT + Open Submissions |
contact.person | Ivan Habernal habernal@ukp.informatik.tu-darmstadt.de Technische Universität Darmstadt |
sponsor | German Research Foundation (DFG) DIP DA 1600/1-1 Information Consolidation: A New Paradigm in Knowledge Search nationalFunds |
sponsor | Amazon Amazon Web Services in Education Grant Web Services in Education Grant Other |
size.info | 10000000000 tokens |
files.size | 4505766924 |
files.count | 56 |
Soubory tohoto záznamu
Licenční kategorie:
Licence: Creative Commons - Attribution-NonCommercial-NoDerivatives 4.0 International (CC BY-NC-ND 4.0)
Publicly Available
Licence: Creative Commons - Attribution-NonCommercial-NoDerivatives 4.0 International (CC BY-NC-ND 4.0)
- Název
- Lic_by-nc-nd_Lang_af_NoBoilerplate_true_MinHtml_true-r-00009.seg-00000.warc.gz
- Velikost
- 36.86 KB
- Formát
- application/x-gzip
- MD5
- 69576a3e19707595aa71930b605f4627
- Název
- Lic_by-nc-nd_Lang_ar_NoBoilerplate_true_MinHtml_true-r-00021.seg-00000.warc.gz
- Velikost
- 4.74 MB
- Formát
- application/x-gzip
- MD5
- 6c02fdcceb78ef9e122886236866f138
- Název
- Lic_by-nc-nd_Lang_bg_NoBoilerplate_true_MinHtml_true-r-00010.seg-00000.warc.gz
- Velikost
- 4.72 MB
- Formát
- application/x-gzip
- MD5
- ef86643e43206b0f4f6ee29df6054171
- Název
- Lic_by-nc-nd_Lang_bn_NoBoilerplate_true_MinHtml_true-r-00017.seg-00000.warc.gz
- Velikost
- 389.08 KB
- Formát
- application/x-gzip
- MD5
- e61bd49a4eb0869e3d9b4b4f0de54c2b
- Název
- Lic_by-nc-nd_Lang_cs_NoBoilerplate_true_MinHtml_true-r-00022.seg-00000.warc.gz
- Velikost
- 3.39 MB
- Formát
- application/x-gzip
- MD5
- bf3bd2ef9709f65099fca54c20048830
- Název
- Lic_by-nc-nd_Lang_da_NoBoilerplate_true_MinHtml_true-r-00004.seg-00000.warc.gz
- Velikost
- 1.7 MB
- Formát
- application/x-gzip
- MD5
- 543fc4c405cf9c2ecb2c5f2a09dcd0d7
- Název
- Lic_by-nc-nd_Lang_de_NoBoilerplate_true_MinHtml_true-r-00008.seg-00000.warc.gz
- Velikost
- 97 MB
- Formát
- application/x-gzip
- MD5
- 69bba32a551bf406157fa45776063cb9
- Název
- Lic_by-nc-nd_Lang_el_NoBoilerplate_true_MinHtml_true-r-00015.seg-00000.warc.gz
- Velikost
- 4.59 MB
- Formát
- application/x-gzip
- MD5
- fc64bc4d976fdb11fc9b925aa48b68e2
- Název
- Lic_by-nc-nd_Lang_en_NoBoilerplate_true_MinHtml_true-r-00017.seg-00000.warc.gz
- Velikost
- 953.7 MB
- Formát
- application/x-gzip
- MD5
- 712b703c52e6dc4f35e86d319baae75f
- Název
- Lic_by-nc-nd_Lang_en_NoBoilerplate_true_MinHtml_true-r-00017.seg-00001.warc.gz
- Velikost
- 953.72 MB
- Formát
- application/x-gzip
- MD5
- 8d7e5b1137285d1fb32f292ec56aa6ef
- Název
- Lic_by-nc-nd_Lang_en_NoBoilerplate_true_MinHtml_true-r-00017.seg-00002.warc.gz
- Velikost
- 953.7 MB
- Formát
- application/x-gzip
- MD5
- 0a6ba521fd9e01552734b8ab45560c48
- Název
- Lic_by-nc-nd_Lang_en_NoBoilerplate_true_MinHtml_true-r-00017.seg-00003.warc.gz
- Velikost
- 422.15 MB
- Formát
- application/x-gzip
- MD5
- d0bc2bec0bd2c21a54e4ab0866ec1820
- Název
- Lic_by-nc-nd_Lang_es_NoBoilerplate_true_MinHtml_true-r-00022.seg-00000.warc.gz
- Velikost
- 434.93 MB
- Formát
- application/x-gzip
- MD5
- 9dccc9d83345aa23609153b014926f22
- Název
- Lic_by-nc-nd_Lang_et_NoBoilerplate_true_MinHtml_true-r-00023.seg-00000.warc.gz
- Velikost
- 962.27 KB
- Formát
- application/x-gzip
- MD5
- f4b1f78285e8d86845fd5fcba44809ba
- Název
- Lic_by-nc-nd_Lang_fa_NoBoilerplate_true_MinHtml_true-r-00004.seg-00000.warc.gz
- Velikost
- 1.24 MB
- Formát
- application/x-gzip
- MD5
- 0fe1c018920afce4a44c32399dcaf839
- Název
- Lic_by-nc-nd_Lang_fi_NoBoilerplate_true_MinHtml_true-r-00012.seg-00000.warc.gz
- Velikost
- 886.21 KB
- Formát
- application/x-gzip
- MD5
- 6e0ada40860fec5077fd9117a4c1da1f
- Název
- Lic_by-nc-nd_Lang_fr_NoBoilerplate_true_MinHtml_true-r-00021.seg-00000.warc.gz
- Velikost
- 83.28 MB
- Formát
- application/x-gzip
- MD5
- aad91d581ff4dfe4aa699a01c4dd27c5
- Název
- Lic_by-nc-nd_Lang_gu_NoBoilerplate_true_MinHtml_true-r-00024.seg-00000.warc.gz
- Velikost
- 170.27 KB
- Formát
- application/x-gzip
- MD5
- 3344cbbeee8499f071f40fc1a4650ee9
- Název
- Lic_by-nc-nd_Lang_he_NoBoilerplate_true_MinHtml_true-r-00008.seg-00000.warc.gz
- Velikost
- 923.04 KB
- Formát
- application/x-gzip
- MD5
- ff765f9fc47e3f3b1729ebb8aa703d78
- Název
- Lic_by-nc-nd_Lang_hi_NoBoilerplate_true_MinHtml_true-r-00012.seg-00000.warc.gz
- Velikost
- 1.58 MB
- Formát
- application/x-gzip
- MD5
- 7f0c265afa29786ac6470d211a50b6e3
- Název
- Lic_by-nc-nd_Lang_hr_NoBoilerplate_true_MinHtml_true-r-00021.seg-00000.warc.gz
- Velikost
- 15.25 MB
- Formát
- application/x-gzip
- MD5
- c98433037a24fce60e9b8b272861850a
- Název
- Lic_by-nc-nd_Lang_hu_NoBoilerplate_true_MinHtml_true-r-00024.seg-00000.warc.gz
- Velikost
- 8.22 MB
- Formát
- application/x-gzip
- MD5
- c9cef762687f8d9996f3bae4e65e723f
- Název
- Lic_by-nc-nd_Lang_id_NoBoilerplate_true_MinHtml_true-r-00007.seg-00000.warc.gz
- Velikost
- 8.34 MB
- Formát
- application/x-gzip
- MD5
- 7b415acc3d3eb7a64ffbe3a932280141
- Název
- Lic_by-nc-nd_Lang_it_NoBoilerplate_true_MinHtml_true-r-00023.seg-00000.warc.gz
- Velikost
- 146.19 MB
- Formát
- application/x-gzip
- MD5
- 396277746745a2519d5528f5745de8d7
- Název
- Lic_by-nc-nd_Lang_ja_NoBoilerplate_true_MinHtml_true-r-00004.seg-00000.warc.gz
- Velikost
- 1.04 MB
- Formát
- application/x-gzip
- MD5
- c3c2d94ce2d10dc663b31a0331cbddc1
- Název
- Lic_by-nc-nd_Lang_kn_NoBoilerplate_true_MinHtml_true-r-00017.seg-00000.warc.gz
- Velikost
- 23.46 KB
- Formát
- application/x-gzip
- MD5
- 996bcfb730f933c0eea50911437c1324
- Název
- Lic_by-nc-nd_Lang_ko_NoBoilerplate_true_MinHtml_true-r-00018.seg-00000.warc.gz
- Velikost
- 10.87 MB
- Formát
- application/x-gzip
- MD5
- f5d7063fdf6d8afdac64111840b622db
- Název
- Lic_by-nc-nd_Lang_lt_NoBoilerplate_true_MinHtml_true-r-00023.seg-00000.warc.gz
- Velikost
- 309.45 KB
- Formát
- application/x-gzip
- MD5
- e6ac5906d36e99a391eb2a3283c8c2e9
- Název
- Lic_by-nc-nd_Lang_lv_NoBoilerplate_true_MinHtml_true-r-00025.seg-00000.warc.gz
- Velikost
- 504.83 KB
- Formát
- application/x-gzip
- MD5
- 63a85d3ce224b9ba39ce417726121e22
- Název
- Lic_by-nc-nd_Lang_mk_NoBoilerplate_true_MinHtml_true-r-00014.seg-00000.warc.gz
- Velikost
- 631.48 KB
- Formát
- application/x-gzip
- MD5
- 23de489f81917f2fbb2196c3a2b6613f
- Název
- Lic_by-nc-nd_Lang_ml_NoBoilerplate_true_MinHtml_true-r-00015.seg-00000.warc.gz
- Velikost
- 240.73 KB
- Formát
- application/x-gzip
- MD5
- aab00f383f979fe3b63630c8b4fb5f24
- Název
- Lic_by-nc-nd_Lang_mr_NoBoilerplate_true_MinHtml_true-r-00021.seg-00000.warc.gz
- Velikost
- 116.49 KB
- Formát
- application/x-gzip
- MD5
- e7330c41903b5426be128824c6315222
- Název
- Lic_by-nc-nd_Lang_ne_NoBoilerplate_true_MinHtml_true-r-00008.seg-00000.warc.gz
- Velikost
- 7.98 KB
- Formát
- application/x-gzip
- MD5
- 921fb655b00dc97b7c2e778640470f4c
- Název
- Lic_by-nc-nd_Lang_nl_NoBoilerplate_true_MinHtml_true-r-00015.seg-00000.warc.gz
- Velikost
- 9.12 MB
- Formát
- application/x-gzip
- MD5
- cc980a0a114ba92bae0dc7fcfac2021e
- Název
- Lic_by-nc-nd_Lang_no_NoBoilerplate_true_MinHtml_true-r-00018.seg-00000.warc.gz
- Velikost
- 604.14 KB
- Formát
- application/x-gzip
- MD5
- 01d3e8243c339e306d42dcaaf45588f0
- Název
- Lic_by-nc-nd_Lang_pl_NoBoilerplate_true_MinHtml_true-r-00015.seg-00000.warc.gz
- Velikost
- 3.19 MB
- Formát
- application/x-gzip
- MD5
- 855da3c770120baf0ae00f889f9235d3
- Název
- Lic_by-nc-nd_Lang_pt_NoBoilerplate_true_MinHtml_true-r-00023.seg-00000.warc.gz
- Velikost
- 94.81 MB
- Formát
- application/x-gzip
- MD5
- 8905485e48c60409506ee6551de02a98
- Název
- Lic_by-nc-nd_Lang_ro_NoBoilerplate_true_MinHtml_true-r-00018.seg-00000.warc.gz
- Velikost
- 7.01 MB
- Formát
- application/x-gzip
- MD5
- 12c5eeee8ad9de164b2b3b2b533d0f05
- Název
- Lic_by-nc-nd_Lang_ru_NoBoilerplate_true_MinHtml_true-r-00024.seg-00000.warc.gz
- Velikost
- 2.5 MB
- Formát
- application/x-gzip
- MD5
- 07c4909f600ae1d2354dcec1eb26d263
- Název
- Lic_by-nc-nd_Lang_sk_NoBoilerplate_true_MinHtml_true-r-00014.seg-00000.warc.gz
- Velikost
- 508.3 KB
- Formát
- application/x-gzip
- MD5
- 16bf8e286e72d012bbbb896b94873bdb
- Název
- Lic_by-nc-nd_Lang_sl_NoBoilerplate_true_MinHtml_true-r-00015.seg-00000.warc.gz
- Velikost
- 843.34 KB
- Formát
- application/x-gzip
- MD5
- 806b6b8d4086dae12d6f90b6f0ac68e0
- Název
- Lic_by-nc-nd_Lang_so_NoBoilerplate_true_MinHtml_true-r-00018.seg-00000.warc.gz
- Velikost
- 180.85 KB
- Formát
- application/x-gzip
- MD5
- 083a395b26194c8ea058b7b4d701208d
- Název
- Lic_by-nc-nd_Lang_sq_NoBoilerplate_true_MinHtml_true-r-00020.seg-00000.warc.gz
- Velikost
- 272.04 KB
- Formát
- application/x-gzip
- MD5
- 9cdaeee4e1e76b23fc26f814646f0b7a
- Název
- Lic_by-nc-nd_Lang_sv_NoBoilerplate_true_MinHtml_true-r-00025.seg-00000.warc.gz
- Velikost
- 1.31 MB
- Formát
- application/x-gzip
- MD5
- e3d5f60275df7f59cfa761d2e1f7334b
- Název
- Lic_by-nc-nd_Lang_sw_NoBoilerplate_true_MinHtml_true-r-00026.seg-00000.warc.gz
- Velikost
- 62.72 KB
- Formát
- application/x-gzip
- MD5
- a34ba1d8320bf614bf92c813fa66d686
- Název
- Lic_by-nc-nd_Lang_ta_NoBoilerplate_true_MinHtml_true-r-00004.seg-00000.warc.gz
- Velikost
- 1.53 MB
- Formát
- application/x-gzip
- MD5
- 5a023eddd92c402d127217bd3075d824
- Název
- Lic_by-nc-nd_Lang_te_NoBoilerplate_true_MinHtml_true-r-00008.seg-00000.warc.gz
- Velikost
- 52.85 KB
- Formát
- application/x-gzip
- MD5
- ef3ef717ef84f738f9b323191623afae
- Název
- Lic_by-nc-nd_Lang_th_NoBoilerplate_true_MinHtml_true-r-00011.seg-00000.warc.gz
- Velikost
- 7.77 MB
- Formát
- application/x-gzip
- MD5
- 3ffda6791e169566ba9cc051ccb2c4ca
- Název
- Lic_by-nc-nd_Lang_tl_NoBoilerplate_true_MinHtml_true-r-00015.seg-00000.warc.gz
- Velikost
- 779.85 KB
- Formát
- application/x-gzip
- MD5
- 76dea75971e43d32b1e032577e34cf62
- Název
- Lic_by-nc-nd_Lang_tr_NoBoilerplate_true_MinHtml_true-r-00021.seg-00000.warc.gz
- Velikost
- 7.76 MB
- Formát
- application/x-gzip
- MD5
- d79ea300a3eacd2b5fbd99563e00f5cc
- Název
- Lic_by-nc-nd_Lang_uk_NoBoilerplate_true_MinHtml_true-r-00014.seg-00000.warc.gz
- Velikost
- 394.12 KB
- Formát
- application/x-gzip
- MD5
- a911fa929596527e2e4b66da4a0bdf5f
- Název
- Lic_by-nc-nd_Lang_unknown_NoBoilerplate_true_MinHtml_true-r-00017.seg-00000.warc.gz
- Velikost
- 34.68 MB
- Formát
- application/x-gzip
- MD5
- eac29c8586b6514bd89e42f8e2588558
- Název
- Lic_by-nc-nd_Lang_ur_NoBoilerplate_true_MinHtml_true-r-00021.seg-00000.warc.gz
- Velikost
- 78.57 KB
- Formát
- application/x-gzip
- MD5
- 421b1ff78fe344087ca0a796d6fd5393
- Název
- Lic_by-nc-nd_Lang_vi_NoBoilerplate_true_MinHtml_true-r-00012.seg-00000.warc.gz
- Velikost
- 7 MB
- Formát
- application/x-gzip
- MD5
- 242f1bed62fcfcf07ac0a198cc678e8e
- Název
- Lic_by-nc-nd_Lang_zh-cn_NoBoilerplate_true_MinHtml_true-r-00017.seg-00000.warc.gz
- Velikost
- 607.65 KB
- Formát
- application/x-gzip
- MD5
- c1d05bf9c964cd34a975c781c03459de
- Název
- Lic_by-nc-nd_Lang_zh-tw_NoBoilerplate_true_MinHtml_true-r-00026.seg-00000.warc.gz
- Velikost
- 630.03 KB
- Formát
- application/x-gzip
- MD5
- cf84cdad88541b4df8b4a38639951862