dc.contributor.author | Majliš, Martin |
dc.date.accessioned | 2013-06-25T13:21:15Z |
dc.date.available | 2013-06-25T13:21:15Z |
dc.date.issued | 2011-12-20 |
dc.identifier.uri | http://hdl.handle.net/11858/00-097C-0000-0022-60D6-1 |
dc.description | A tool used to build multilingual corpora from Wikipedia. It downloads web pages, converts them to plain text, identifies their language, etc. A set of 120 corpora collected using this tool is available at https://ufal-point.mff.cuni.cz/xmlui/handle/11858/00-097C-0000-0022-6133-9 |
dc.publisher | Charles University, Faculty of Mathematics and Physics, Institute of Formal and Applied Linguistics (UFAL) |
dc.relation.uri | http://hdl.handle.net/11858/00-097C-0000-0022-6133-9 |
dc.rights | Attribution-ShareAlike 3.0 Unported (CC BY-SA 3.0) |
dc.rights.uri | http://creativecommons.org/licenses/by-sa/3.0/ |
dc.subject | web data |
dc.subject | wikipedia |
dc.subject | corpus creation |
dc.title | W2C – Web to Corpus – tool |
dc.type | toolService |
metashare.ResourceInfo#ContactInfo#PersonInfo.surname | Popel |
metashare.ResourceInfo#ContactInfo#PersonInfo.givenName | Martin |
metashare.ResourceInfo#ContactInfo#PersonInfo#OrganizationInfo.organizationName | Charles University in Prague, UFAL |
metashare.ResourceInfo#DistributionInfo.availability | restrictedUse |
metashare.ResourceInfo#DistributionInfo#LicenseInfo.restrictionsOfUse | attribution |
metashare.ResourceInfo#DistributionInfo#LicenseInfo.restrictionsOfUse | shareAlike |
metashare.ResourceInfo#DistributionInfo#LicenseInfo.distributionAccessMedium | downloadable |
metashare.ResourceInfo#ValidationInfo.validated | True |
metashare.ResourceInfo#ContactInfo#PersonInfo#OrganizationInfo#CommunicationInfo.email | popel@ufal.mff.cuni.cz |
metashare.ResourceInfo#ResourceComponentType#ToolServiceInfo.languageDependent | false |
metashare.ResourceInfo#ContentInfo.detailedType | suiteOfTools |
dc.rights.label | PUB |
has.files | yes |
branding | LINDAT / CLARIAH-CZ |
files.size | 750549 |
files.count | 2 |
Files in this item
Download all files in item (732.96 KB)
This item is
Attribution-ShareAlike 3.0 Unported (CC BY-SA 3.0)
Publicly Available
and licensed under: Attribution-ShareAlike 3.0 Unported (CC BY-SA 3.0)
- Name
- w2c.tar.gz
- Size
- 165.85 KB
- Format
- application/x-gzip
- Description
- W2C toolset source code
- MD5
- 747d9fabca38d085e976950193029ca3
- w2c-src
- pipes
- vertical.sh476 B
- Makefile52 B
- strip-html-tags.php423 B
- wiki-remove-interwiki.sh409 B
- checkRequirements.sh149 B
- frequencyList.sh384 B
- ngrams.pl1 kB
- wordLengths.pl793 B
- textStats.pl7 kB
- Makefile1 kB
- tests
- config.sh783 B
- boilerpipe
- Makefile1 kB
- Makefile112 B
- wac
- Makefile2 kB
- create-url-list.sh648 B
- langDetect
- Makefile17 B
- test.sh1 kB
- webAPI
- Makefile60 B
- test.sh1 kB
- README396 B
- checkRequirements.sh2 kB
- scripts
- lab
- copy-results-to-single-node.sh2 kB
- web-mini-corpora-lab.sh1 kB
- delete-created-files.sh425 B
- process-file-wrapper.sh1006 B
- copy-to-ufallab.sh346 B
- process-results-fix-wrapper.sh2 kB
- missing-languages.sh565 B
- process-file.sh1 kB
- info-words.sh633 B
- process-results-fix.sh7 kB
- fill-corpora-quotas-wrapper.sh415 B
- process-results.sh9 kB
- build-package-wrapper.sh1 kB
- process-results-wrapper.sh2 kB
- process-results-overview.sh14 kB
- run-serial.sh495 B
- run-parallel.sh467 B
- build-package.sh5 kB
- fill-corpora-quotas.sh2 kB
- download-wikipedias.sh1 kB
- ufallab
- config.sh321 B
- copy-files-from-webhosting.sh980 B
- merge-results.sh4 kB
- generate-stats.sh15 kB
- extract-data.sh535 B
- lab
- data
- tools
- ethnologueParser
- extractInfo.pl2 kB
- ethnologueParser.sh1 kB
- fillDB.sh1 kB
- splitter
- splitter.pl2 kB
- splitter.sh247 B
- webAPI
- webAPI.sh2 kB
- crawlerSimple
- Makefile61 B
- crawlerSimple.sh4 kB
- internetSize
- internetSize.sh6 kB
- corpusAnalysis
- generateCAHTML.sh1007 B
- frequencyList.pl1 kB
- config.sh206 B
- corpusAnalysis.sh969 B
- grep
- checkRequirements.sh292 B
- Makefile318 B
- search
- google.pl2 kB
- ua.txt1 kB
- langDetect
- train.pl5 kB
- filter.pl2 kB
- config.sh109 B
- createLanguageModel.sh3 kB
- train.sh194 B
- detector.pl2 kB
- detect.pl3 kB
- detect.sh196 B
- eval.pl7 kB
- eval.sh192 B
- convert.pl1 kB
- wikiCorpora
- wikiCorpora.sh2 kB
- wikiCorpora.pl602 B
- aspellCoverage
- aspellCoverage.sh3 kB
- README208 B
- massExecute
- massExecute.sh237 B
- massExecute.pl1 kB
- Makefile356 B
- langList
- config.sh154 B
- Makefile47 B
- langList.sh201 B
- langInfo.sh2 kB
- langList.pl5 kB
- README55 B
- languages.2010-12-12.all161 kB
- languages.2010-12-12.wiki14 kB
- fillLangDB
- fillFiles01.sh26 kB
- fillLanguageInfo.sh710 B
- Makefile108 B
- fillAliases.sh1 kB
- fillAspellDictionaries.sh1 kB
- utils
- mergeAllStatistics.sh7 kB
- cleanFile.sh1 kB
- xpath
- xpath.sh94 B
- xpath.pl663 B
- normalize.pl1 kB
- processFile.sh2 kB
- processFiles.sh2 kB
- expandSearch.pl1 kB
- config.sh2 kB
- wikiExternalLinks
- wikiExternalLinks.sh2 kB
- wikiMiniCorpora
- Makefile183 B
- wikiMiniCorpora.sh2 kB
- ethnologueParser
- builder
- html2text.pl795 B
- parser.pl6 kB
- links.txt349 B
- regenerateTexts.sh2 kB
- create-corpora.pl45 kB
- log-analyzer.pl7 kB
- tidy.sh339 B
- links.small.txt43 B
- Makefile1 kB
- create-corpora.sh3 kB
- README1 kB
- packData.sh2 kB
- charter.sh8 kB
- Common.pm17 kB
- config.xml489 B
- extract-results.sh361 B
- controller.pl5 kB
- keeper.sh3 kB
- crawler.pl9 kB
- create-corpora-local.sh403 B
- detector.pl7 kB
- lib
- WikiParser.pm3 kB
- LanguageModel.pm652 B
- Makefile15 B
- t
- LangDetect.t587 B
- Utils.pm149 B
- LangDetect.pm538 B
- experiments
- aspell-coverage-01.sh6 kB
- word-lengths-01.sh4 kB
- lang-stats-01.sh4 kB
- web-mini-corpora.sh562 B
- wiki-vs-internet-size-01.sh7 kB
- visualizations
- splitter.pl536 B
- transpose.sh411 B
- table.pl5 kB
- dataStats.sh165 B
- dataStats.pl784 B
- dataCombiner.pl1 kB
- listDiffScore.pl1 kB
- plot.sh848 B
- bin
- wikiExternalLinks.sh0 B
- normalize.pl0 B
- splitter.pl0 B
- crawlerSimple.sh0 B
- langList.sh0 B
- xpath.sh0 B
- wikiCorpora.sh0 B
- corpusAnalysis.sh0 B
- aspellCoverage.sh0 B
- wikiMiniCorpora.sh0 B
- langInfo.sh0 B
- webAPI.sh0 B
- generateCAHTML.sh0 B
- pipes
- Name
- tr46.pdf
- Size
- 567.11 KB
- Format
- Description
- Technical Report - documentation
- MD5
- 824ef862d75b40fc324d54b13a592ee1